Section 1 - Overview of available data

Load data

tgc_all <- read_csv("../TGC_data_031222.csv")
## Rows: 13000 Columns: 105
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (31): TGC_sangerlane, Data Accession, Study Accession, Sample Accession,...
## dbl (66): CORE MATCHES, % CORE FAMILIES, % NON-CORE, GENOME LENGTH, N50, NO....
## lgl  (8): MDR, cipNS, cipR, XDR, aziR, untargetedSurveillance, assumedAcute,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# total genomes
nrow(tgc_all)
## [1] 13000

Genome size analysis

# genomes excluded due to size
table(tgc_all$exclude_assembly)
## 
## FALSE  TRUE 
## 12965    35
tgc_all <- tgc_all %>% 
  mutate(IncHI1=grepl("IncHI1",`Inc Types`))

table(tgc_all$IncHI1, tgc_all$exclude_assembly)
##        
##         FALSE  TRUE
##   FALSE 12007    32
##   TRUE    958     3
genome_sizes <- tgc_all %>% 
  ggplot( aes(y=`GENOME LENGTH`, x=IncHI1)) + 
      geom_violin() +
      theme_bw() + 
      geom_hline(yintercept = 4.5E6, linetype="dashed", color="blue") + 
      geom_hline(yintercept = 5.5E6, linetype="dashed", color="blue") +
      labs(x="IncHI1 detected", y="Genome length (bp)", subtitle="a) All genomes, pre-filter") + 
      scale_x_discrete(labels=c(paste0("yes (n=",sum(tgc_all$IncHI1),")"), paste0("no (n=",sum(!tgc_all$IncHI1),")")))

genome_sizes

genome_sizes_included <- tgc_all %>% 
  filter(exclude_assembly != TRUE) %>%
  ggplot( aes(y=`GENOME LENGTH`, x=IncHI1)) + 
      geom_violin() +
      theme_bw() + 
      labs(x="IncHI1 detected", y="Genome length (bp)", subtitle="b) Included genomes") + 
      scale_x_discrete(labels=c(paste0("yes (n=",sum(tgc_all$IncHI1[!tgc_all$exclude_assembly]),")"), paste0("no (n=",sum(!tgc_all$IncHI1[!tgc_all$exclude_assembly]),")")))

genome_sizes_included

Fig S1 - genome size

pdf(paste0("SuppFig1_genomeSize_by_inc.pdf"), width=6, height=4)
genome_sizes | genome_sizes_included
dev.off()
## quartz_off_screen 
##                 2
png(paste0("SuppFig1_genomeSize_by_inc.png"), width=6, height=4, units="in", res=400)
genome_sizes | genome_sizes_included
dev.off()
## quartz_off_screen 
##                 2

Table 1 - summary of contributing papers

paper_summary <- tgc_all %>%
  mutate(source=ifelse(is.na(PMID), `Isolating Lab`, PMID)) %>%
    group_by(source)%>% 
    summarize(Total =n(),
              `Representative surveillance 2010-2020` =sum(Year >= 2010 & assumedAcute & untargetedSurveillance), 
              `Travel Associated`=sum(`Travel Associated`=="Yes" & Year >= 2010 & assumedAcute & untargetedSurveillance)) %>%
  adorn_totals("row")

paper_summary
##                     source Total Representative surveillance 2010-2020
##  10.1101/2022.09.01.506167    57                                     0
##  10.1101/2022.10.03.510628   463                                     0
##                   11677608     1                                     0
##                   12644504     1                                     0
##                   18660809     4                                     0
##                   25392358    22                                     0
##                   25428145     2                                     0
##                   25961941  1736                                   733
##                   26411565    30                                     0
##                   26974227    77                                    77
##                   27069781   489                                   432
##                   27331909     1                                     1
##                   27657909   128                                   111
##                   27703135    99                                    43
##                   28060810    44                                     0
##                   28280021     3                                     0
##                   28705963     2                                     0
##                   28931025    64                                    59
##                   29051234     1                                     0
##                   29136410     1                                     0
##                   29216342     5                                     4
##                   29255729   107                                     0
##                   29463654   100                                     0
##                   29616895     1                                     0
##                   29684021   192                                   169
##                   30236166    30                                     0
##                   30425150   536                                     0
##                   30504848   249                                   209
##                   31225619    39                                    39
##                   31513580   107                                    99
##                   31665304    94                                    94
##                   31730615    12                                     0
##                   31872221     2                                     0
##                   32003431   194                                     0
##                   32106221   202                                   147
##                   32119918     1                                     0
##                   32217683     5                                     0
##                   32253142     1                                     0
##                   32732230     1                                     0
##                   32883020    27                                    27
##                   33079054     7                                     7
##                   33085725   116                                     0
##                   33347558    29                                     0
##                   33496224    15                                    15
##                   33515460    66                                    66
##                   33593966    80                                    80
##                   33651791     8                                     0
##                   33704480    58                                    58
##                   33965548     2                                     0
##                   34223059     4                                     0
##                   34370659   631                                   604
##                   34463736   262                                   262
##                   34515028   136                                    88
##                   34529660    77                                     0
##                   34543095   116                                   116
##                   34626469    92                                    92
##                   34812716    16                                    14
##                   35344544    41                                     0
##                   35750070  3402                                  3390
##                   35767580   203                                    90
##                   35999186   190                                   190
##                   36026470    22                                    14
##                   36094088   202                                   174
##                        ESR    99                                    97
##                        IPA    23                                    23
##                        MLW    20                                    20
##                        NCD   281                                   281
##                        RBC    51                                    26
##                    STRATAA   732                                   707
##                      USCDC   889                                   850
##                      Total 13000                                  9508
##  Travel Associated
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                248
##                  0
##                  0
##                356
##                  0
##                  0
##                 43
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  3
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                 91
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                 12
##                  0
##                  0
##                  0
##                 58
##                  0
##                  0
##                584
##                  0
##                  0
##                  0
##                107
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  0
##                  1
##                 52
##                 17
##                  0
##                 13
##                  0
##                  0
##                712
##               2297
write.csv(paper_summary, file=paste0("Table1_sourcePublication_summary.csv"), row.names=F)

exclude genomes on basis of size

tgc_all <- tgc_all %>% filter(exclude_assembly != TRUE)

Overview of dataset for text

# total high quality genomes
nrow(tgc_all)
## [1] 12965
# number of countries
length(table(tgc_all$Country_Origin))
## [1] 110
# assumed acute cases
table(tgc_all$assumedAcute)
## 
## FALSE  TRUE 
##   134 12831
# non-targeted
tgc_all %>% filter(startsWith(`Purpose of Sampling`, "Non")) %>% summarise(n=n())
## # A tibble: 1 × 1
##       n
##   <int>
## 1 11086
# targeted
tgc_all %>% filter(startsWith(`Purpose of Sampling`, "Targeted")) %>% summarise(n=n())
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1862
# purpose not provided
tgc_all %>% filter(`Purpose of Sampling` == "Not Provided") %>% summarise(n=n())
## # A tibble: 1 × 1
##       n
##   <int>
## 1    17

Table S3 - samples and sources

source_vs_health <- table(tgc_all$Source,tgc_all$`Host Health State`)

source_vs_health <- rbind(source_vs_health, Total=colSums(source_vs_health))

source_vs_health <- source_vs_health[c("Blood", "Stool", "Urine", "Gallbladder", "Environment", "Other", "Not Provided", "Total"),c("Symptomatic", "Asymptomatic Carrier", "Not Provided")]

write.csv(source_vs_health, file=paste0("SuppTable3_health_vs_source_excludeFails.csv"))

Table 2 - summary of genomes by region

region_summary <- tgc_all %>%
    group_by(Region)%>% 
    summarize(region_total =n(),
              post2010_rep =sum(Year >= 2010 & assumedAcute & untargetedSurveillance), 
              travel_post2010=sum(`Travel Associated`=="Yes" & Year >= 2010 & assumedAcute & untargetedSurveillance), 
              
              ) %>%
  adorn_totals("row") %>%
  mutate(travel_post2010_pc = paste0(travel_post2010, " (", 
                                          round(travel_post2010/post2010_rep*100,1), "%)")
  ) %>% 
  select(Region, region_total, post2010_rep, travel_post2010_pc) %>% 
  rename(`Total genomes`=region_total, `Representative cases from 2010`=post2010_rep, `Travel (%) amongst representative cases from 2010`=travel_post2010_pc) %>%
  mutate(Region=replace(Region, is.na(Region), "Unknown"))

region_summary
##                     Region Total genomes Representative cases from 2010
##  Australia and New Zealand            57                             57
##                  Caribbean            20                             20
##            Central America           103                            100
##             Eastern Africa          1106                            830
##               Eastern Asia            12                              3
##             Eastern Europe             3                              1
##                  Melanesia           232                             37
##                 Micronesia             4                              1
##              Middle Africa            59                             21
##            Northern Africa            41                              6
##           Northern America           167                            140
##            Northern Europe           109                            105
##                  Polynesia           324                            262
##              South America           367                            105
##         South-eastern Asia          1140                            584
##            Southern Africa           317                            286
##              Southern Asia          8231                           6623
##            Southern Europe            10                              6
##             Western Africa           384                            267
##               Western Asia            47                             21
##             Western Europe             7                              3
##                    Unknown           225                              0
##                      Total         12965                           9478
##  Travel (%) amongst representative cases from 2010
##                                             0 (0%)
##                                          20 (100%)
##                                         100 (100%)
##                                          49 (5.9%)
##                                           3 (100%)
##                                           1 (100%)
##                                         30 (81.1%)
##                                           1 (100%)
##                                          6 (28.6%)
##                                           6 (100%)
##                                           2 (1.4%)
##                                             0 (0%)
##                                         45 (17.2%)
##                                           5 (4.8%)
##                                         72 (12.3%)
##                                           2 (0.7%)
##                                       1878 (28.4%)
##                                           6 (100%)
##                                         34 (12.7%)
##                                          21 (100%)
##                                           3 (100%)
##                                           0 (NaN%)
##                                       2284 (24.1%)
write.csv(region_summary, file=paste0("Table2_region_summary.csv"), row.names=F)

Table S4 - country breakdown

country_summary <- tgc_all %>%
    group_by(Region, Country_Origin)%>% 
    summarize(n =n(),
              travel = sum(`Travel Associated`=="Yes"),
              untargeted_from2010 =sum(Year >= 2010 & assumedAcute & untargetedSurveillance), 
              travel_untargeted_from2010 = sum(`Travel Associated`=="Yes" & Year >= 2010 & assumedAcute & untargetedSurveillance), 
              ) %>%
    adorn_totals("row") %>%
    mutate(travel_pc = paste0(travel, " (", round(travel/n*100,1), "%)")) %>%
    mutate(travel_untargeted_from2010_pc = paste0(travel_untargeted_from2010, " (", round(travel_untargeted_from2010/untargeted_from2010*100,1), "%)")) %>%
  select(Region, Country_Origin, n, travel_pc, untargeted_from2010, travel_untargeted_from2010_pc) %>% 
  rename(`Total genomes`=n, 
         `Travel (%)`=travel_pc,
         `Representative cases from 2010`=untargeted_from2010, 
         `Travel (%) amongst representative cases from 2010`=travel_untargeted_from2010_pc) %>%
  mutate(Region=replace(Region, is.na(Region), "Unknown"))
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
write.csv(country_summary, file=paste0("SuppTable4_country_summary.csv"), row.names=F)

country level counts for text

country_count_by_region <- table(paste0(tgc_all$Region, ": ", tgc_all$Country_Origin))

sum(country_count_by_region>=20)
## [1] 36
sum(country_count_by_region[country_count_by_region>=20])-225 # subtract the 'unknown' category
## [1] 12410
(sum(country_count_by_region[country_count_by_region>=20])-225)/nrow(tgc_all)
## [1] 0.9571924
sum(country_count_by_region>=100)
## [1] 21
sum(country_count_by_region[country_count_by_region>=100])-225 # subtract the 'unknown' category
## [1] 11762
(sum(country_count_by_region[country_count_by_region>=100])-225)/nrow(tgc_all)
## [1] 0.9072117

travel summary for text

table(tgc_all$`Travel Associated`)
## 
##           No Not Provided          Yes 
##         4736         4848         3381
# labs contirbuting travel-associated isolates
tgc_all %>% group_by(`Isolating Lab`, `Travel Associated`) %>% summarise (n=n()) %>% filter(`Travel Associated`=="Yes")
## `summarise()` has grouped output by 'Isolating Lab'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 3
## # Groups:   Isolating Lab [14]
##    `Isolating Lab` `Travel Associated`     n
##    <chr>           <chr>               <int>
##  1 CDC             Yes                   749
##  2 ESR             Yes                   144
##  3 IDS             Yes                    12
##  4 IPA             Yes                   116
##  5 KDC             Yes                     8
##  6 MDU             Yes                   490
##  7 NCD             Yes                    13
##  8 NDJ             Yes                   104
##  9 NIP             Yes                     1
## 10 PHE             Yes                  1740
## 11 TDC             Yes                     1
## 12 UBS             Yes                     1
## 13 USF             Yes                     1
## 14 WHP             Yes                     1
# country of origin for travel-associated isolates
tgc_all %>% group_by(Country_Origin, `Travel Associated`) %>% summarise (n=n()) %>% filter(`Travel Associated`=="Yes") %>% filter(n>35)
## `summarise()` has grouped output by 'Country_Origin'. You can override using
## the `.groups` argument.
## # A tibble: 12 × 3
## # Groups:   Country_Origin [12]
##    Country_Origin   `Travel Associated`     n
##    <chr>            <chr>               <int>
##  1 Bangladesh       Yes                   264
##  2 Chile            Yes                    49
##  3 Fiji             Yes                   102
##  4 India            Yes                  1241
##  5 Indonesia        Yes                    39
##  6 Mexico           Yes                    60
##  7 Nepal            Yes                    39
##  8 Nigeria          Yes                    42
##  9 Not Provided     Yes                   193
## 10 Pakistan         Yes                   783
## 11 Papua New Guinea Yes                    45
## 12 Samoa            Yes                    87

define representative subsets for analysis - counts for text

# surveillance-ready samples only

tgc_meta <- tgc_all %>%
  filter(assumedAcute == T, 
         untargetedSurveillance == T,
         Country_Origin != "Not Provided")

nrow(tgc_meta)
## [1] 10726
# surveillance-ready samples, known years, 2010 onwards
tgc_from2010 <- tgc_all %>%
  filter(assumedAcute == T, 
         untargetedSurveillance == T,
         Country_Origin != "Not Provided",
         Year >=2010)

nrow(tgc_from2010)
## [1] 9478
nrow(tgc_from2010)/nrow(tgc_meta)
## [1] 0.8836472

Section 2 - Geographic distribution of genotypes

Table S5: genotype prevalence per region, 2010-2020

region_N_from2010 <- tgc_from2010 %>%
  group_by(Region)%>% 
  summarize(N =n())

tgc_from2010_regionFreq_allGeno <- tgc_from2010 %>%
  group_by(Region, Final_genotype)%>% 
  summarize(x =n()) %>%
  left_join(region_N_from2010) %>% # region counts
  mutate(p = x/N) %>% # proportion per region
  mutate(se = sqrt(p*(1-p)/N)) %>%
  mutate(p_lowerCI = p-1.96*se, p_upperCI = p+1.96*se) %>%
  mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
  mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
  select(-se)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## Joining, by = "Region"
# annual counts/freqs
region_N_annual <- tgc_from2010 %>%
  group_by(Region, Year)%>% 
  summarize(N =n())
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
tgc_from2010_regionFreq_allGeno_annual <- tgc_from2010 %>%
  group_by(Region, Final_genotype, Year)%>% 
  summarize(x =n()) %>%
  left_join(region_N_annual) %>% # regional annual
  mutate(p = x/N) %>% # proportion per region
  mutate(se = sqrt(p*(1-p)/N)) %>%
  mutate(p_lowerCI = p-1.96*se, p_upperCI = p+1.96*se) %>%
  mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
  mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
  select(-se)
## `summarise()` has grouped output by 'Region', 'Final_genotype'. You can
## override using the `.groups` argument.
## Joining, by = c("Region", "Year")
# merge annual counts with total counts into one table for printing
genotypes_by_region <- tgc_from2010_regionFreq_allGeno %>% 
  mutate(Year="all") %>% 
  bind_rows(tgc_from2010_regionFreq_allGeno_annual) %>% 
  arrange(Region, Year, Final_genotype) %>%
  rename(Genotype=Final_genotype) %>%
  rename(proportion=p) %>%
  relocate(Year, .after=Genotype)
write.csv(genotypes_by_region, file=paste0("SuppTable5_genotypeFreqs_byRegion_2010-2020.csv"), row.names=F)

Table S6: genotype prevalence per country, 2010-2020

country_N_from2010 <- tgc_from2010 %>%
  group_by(Country_Origin)%>% 
  summarize(N =n())

tgc_from2010_countryFreq_allGeno <- tgc_from2010 %>%
  group_by(Region, Country_Origin, Final_genotype)%>% 
  summarize(x =n()) %>%
  left_join(country_N_from2010) %>% # country counts
  mutate(p = x/N) %>% # proportion per region
  mutate(se = sqrt(p*(1-p)/N)) %>%
  mutate(p_lowerCI = p-1.96*se,
         p_upperCI = p+1.96*se) %>%
  mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
  mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
  select(-se)
## `summarise()` has grouped output by 'Region', 'Country_Origin'. You can
## override using the `.groups` argument.
## Joining, by = "Country_Origin"
# annual counts/freqs
country_N_annual <- tgc_from2010 %>%
  group_by(Country_Origin, Year)%>% 
  summarize(N =n())
## `summarise()` has grouped output by 'Country_Origin'. You can override using
## the `.groups` argument.
tgc_from2010_countryFreq_allGeno_annual <- tgc_from2010 %>%
  group_by(Region, Country_Origin, Final_genotype, Year)%>% 
  summarize(x =n()) %>%
  left_join(country_N_annual) %>% # country annual
  mutate(p = x/N) %>% # proportion per country
  mutate(se = sqrt(p*(1-p)/N)) %>%
  mutate(p_lowerCI = p-1.96*se, p_upperCI = p+1.96*se) %>%
  mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
  mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
  select(-se)
## `summarise()` has grouped output by 'Region', 'Country_Origin',
## 'Final_genotype'. You can override using the `.groups` argument.
## Joining, by = c("Country_Origin", "Year")
# merge annual counts with total counts into one table for printing
genotypes_by_country <- tgc_from2010_countryFreq_allGeno %>% 
  mutate(Year="all") %>% 
  bind_rows(tgc_from2010_countryFreq_allGeno_annual) %>% 
  arrange(Region, Country_Origin, Year, Final_genotype) %>%
  rename(Genotype=Final_genotype) %>%
  rename(proportion=p) %>%
  relocate(Year, .after=Genotype)
write.csv(genotypes_by_country, file=paste0("SuppTable6_genotypeFreqs_byCountry_2010-2020.csv"), row.names=F)

genotype prevalence numbers for text - H58 genotypes by region

# H58 regional stats
tgc_from2010_regionFreq_h58 <- tgc_from2010 %>%
  mutate(h58 = startsWith(Final_genotype,"4.3.1")) %>%
  group_by(Region, h58)%>% 
  summarize(x =n()) %>%
  left_join(region_N_from2010) %>% # country counts
  mutate(p = x/N) %>% # proportion per region
  mutate(se = sqrt(p*(1-p)/N)) %>%
  mutate(p_lowerCI = p-1.96*se,
         p_upperCI = p+1.96*se) %>%
  mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
  mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
  select(-se)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## Joining, by = "Region"
tgc_from2010_regionFreq_h58
## # A tibble: 34 × 7
## # Groups:   Region [21]
##    Region                    h58       x     N      p p_lowerCI p_upperCI
##    <chr>                     <lgl> <int> <int>  <dbl>     <dbl>     <dbl>
##  1 Australia and New Zealand FALSE    51    57 0.895     0.815     0.974 
##  2 Australia and New Zealand TRUE      6    57 0.105     0.0256    0.185 
##  3 Caribbean                 FALSE    20    20 1         1         1     
##  4 Central America           FALSE    97   100 0.97      0.937     1     
##  5 Central America           TRUE      3   100 0.03      0         0.0634
##  6 Eastern Africa            FALSE    56   830 0.0675    0.0504    0.0845
##  7 Eastern Africa            TRUE    774   830 0.933     0.915     0.950 
##  8 Eastern Asia              FALSE     2     3 0.667     0.133     1     
##  9 Eastern Asia              TRUE      1     3 0.333     0         0.867 
## 10 Eastern Europe            FALSE     1     1 1         1         1     
## # … with 24 more rows
# H58 lineage 1 regional stats
tgc_from2010_regionFreq_h58L1 <- tgc_from2010 %>%
  mutate(h58L1 = startsWith(Final_genotype,"4.3.1.1")) %>%
  group_by(Region, h58L1)%>% 
  summarize(x =n()) %>%
  left_join(region_N_from2010) %>% # country counts
  mutate(p = x/N) %>% # proportion per region
  mutate(se = sqrt(p*(1-p)/N)) %>%
  mutate(p_lowerCI = p-1.96*se,
         p_upperCI = p+1.96*se) %>%
  mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
  mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
  select(-se)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## Joining, by = "Region"
tgc_from2010_regionFreq_h58L1
## # A tibble: 31 × 7
## # Groups:   Region [21]
##    Region                    h58L1     x     N      p p_lowerCI p_upperCI
##    <chr>                     <lgl> <int> <int>  <dbl>     <dbl>     <dbl>
##  1 Australia and New Zealand FALSE    55    57 0.965      0.917    1     
##  2 Australia and New Zealand TRUE      2    57 0.0351     0        0.0829
##  3 Caribbean                 FALSE    20    20 1          1        1     
##  4 Central America           FALSE    98   100 0.98       0.953    1     
##  5 Central America           TRUE      2   100 0.02       0        0.0474
##  6 Eastern Africa            FALSE   179   830 0.216      0.188    0.244 
##  7 Eastern Africa            TRUE    651   830 0.784      0.756    0.812 
##  8 Eastern Asia              FALSE     3     3 1          1        1     
##  9 Eastern Europe            FALSE     1     1 1          1        1     
## 10 Melanesia                 FALSE    37    37 1          1        1     
## # … with 21 more rows
# H58 lineage 2 regional stats
tgc_from2010_regionFreq_h58L2 <- tgc_from2010 %>%
  mutate(h58L2 = startsWith(Final_genotype,"4.3.1.2")) %>%
  group_by(Region, h58L2)%>% 
  summarize(x =n()) %>%
  left_join(region_N_from2010) %>% # country counts
  mutate(p = x/N) %>% # proportion per region
  mutate(se = sqrt(p*(1-p)/N)) %>%
  mutate(p_lowerCI = p-1.96*se,
         p_upperCI = p+1.96*se) %>%
  mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
  mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
  select(-se)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## Joining, by = "Region"
tgc_from2010_regionFreq_h58L2
## # A tibble: 34 × 7
## # Groups:   Region [21]
##    Region                    h58L2     x     N      p p_lowerCI p_upperCI
##    <chr>                     <lgl> <int> <int>  <dbl>     <dbl>     <dbl>
##  1 Australia and New Zealand FALSE    53    57 0.930    0.864      0.996 
##  2 Australia and New Zealand TRUE      4    57 0.0702   0.00386    0.136 
##  3 Caribbean                 FALSE    20    20 1        1          1     
##  4 Central America           FALSE    99   100 0.99     0.970      1     
##  5 Central America           TRUE      1   100 0.01     0          0.0295
##  6 Eastern Africa            FALSE   707   830 0.852    0.828      0.876 
##  7 Eastern Africa            TRUE    123   830 0.148    0.124      0.172 
##  8 Eastern Asia              FALSE     2     3 0.667    0.133      1     
##  9 Eastern Asia              TRUE      1     3 0.333    0          0.867 
## 10 Eastern Europe            FALSE     1     1 1        1          1     
## # … with 24 more rows

genotype prevalence numbers for text - H58 genotypes by country

# H58 country stats
tgc_from2010_countryFreq_h58 <- tgc_from2010 %>%
  mutate(h58 = startsWith(Final_genotype,"4.3.1")) %>%
  group_by(Region, Country_Origin, h58)%>% 
  summarize(x =n()) %>%
  left_join(country_N_from2010) %>% # country counts
  mutate(p = x/N) %>% # proportion per country
  mutate(se = sqrt(p*(1-p)/N)) %>%
  mutate(p_lowerCI = p-1.96*se,
         p_upperCI = p+1.96*se) %>%
  mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
  mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
  select(-se)
## `summarise()` has grouped output by 'Region', 'Country_Origin'. You can
## override using the `.groups` argument.
## Joining, by = "Country_Origin"
tgc_from2010_countryFreq_h58
## # A tibble: 111 × 8
## # Groups:   Region, Country_Origin [82]
##    Region            Country_Origin h58       x     N      p p_lowerCI p_upperCI
##    <chr>             <chr>          <lgl> <int> <int>  <dbl>     <dbl>     <dbl>
##  1 Australia and Ne… Australia      FALSE     7    12 0.583      0.304    0.862 
##  2 Australia and Ne… Australia      TRUE      5    12 0.417      0.138    0.696 
##  3 Australia and Ne… New Zealand    FALSE    44    45 0.978      0.935    1     
##  4 Australia and Ne… New Zealand    TRUE      1    45 0.0222     0        0.0653
##  5 Caribbean         Dominican Rep… FALSE     6     6 1          1        1     
##  6 Caribbean         Haiti          FALSE    12    12 1          1        1     
##  7 Caribbean         Jamaica        FALSE     1     1 1          1        1     
##  8 Caribbean         Turks and Cai… FALSE     1     1 1          1        1     
##  9 Central America   El Salvador    FALSE    19    19 1          1        1     
## 10 Central America   Guatemala      FALSE    22    22 1          1        1     
## # … with 101 more rows
# H58 L1 country stats
tgc_from2010_countryFreq_h58L1 <- tgc_from2010 %>%
  mutate(h58L1 = startsWith(Final_genotype,"4.3.1.1")) %>%
  group_by(Region, Country_Origin, h58L1)%>% 
  summarize(x =n()) %>%
  left_join(country_N_from2010) %>% # country counts
  mutate(p = x/N) %>% # proportion per region
  mutate(se = sqrt(p*(1-p)/N)) %>%
  mutate(p_lowerCI = p-1.96*se,
         p_upperCI = p+1.96*se) %>%
  mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
  mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
  select(-se)
## `summarise()` has grouped output by 'Region', 'Country_Origin'. You can
## override using the `.groups` argument.
## Joining, by = "Country_Origin"
tgc_from2010_countryFreq_h58L1
## # A tibble: 107 × 8
## # Groups:   Region, Country_Origin [82]
##    Region             Country_Origin h58L1     x     N     p p_lowerCI p_upperCI
##    <chr>              <chr>          <lgl> <int> <int> <dbl>     <dbl>     <dbl>
##  1 Australia and New… Australia      FALSE    10    12 0.833     0.622     1    
##  2 Australia and New… Australia      TRUE      2    12 0.167     0         0.378
##  3 Australia and New… New Zealand    FALSE    45    45 1         1         1    
##  4 Caribbean          Dominican Rep… FALSE     6     6 1         1         1    
##  5 Caribbean          Haiti          FALSE    12    12 1         1         1    
##  6 Caribbean          Jamaica        FALSE     1     1 1         1         1    
##  7 Caribbean          Turks and Cai… FALSE     1     1 1         1         1    
##  8 Central America    El Salvador    FALSE    19    19 1         1         1    
##  9 Central America    Guatemala      FALSE    22    22 1         1         1    
## 10 Central America    Mexico         FALSE    57    58 0.983     0.949     1    
## # … with 97 more rows
# H58 L2 country stats
tgc_from2010_countryFreq_h58L2 <- tgc_from2010 %>%
  mutate(h58L2 = startsWith(Final_genotype,"4.3.1.2")) %>%
  group_by(Region, Country_Origin, h58L2)%>% 
  summarize(x =n()) %>%
  left_join(country_N_from2010) %>% # country counts
  mutate(p = x/N) %>% # proportion per region
  mutate(se = sqrt(p*(1-p)/N)) %>%
  mutate(p_lowerCI = p-1.96*se,
         p_upperCI = p+1.96*se) %>%
  mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
  mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
  select(-se)
## `summarise()` has grouped output by 'Region', 'Country_Origin'. You can
## override using the `.groups` argument.
## Joining, by = "Country_Origin"
tgc_from2010_countryFreq_h58L2
## # A tibble: 107 × 8
## # Groups:   Region, Country_Origin [82]
##    Region            Country_Origin h58L2     x     N      p p_lowerCI p_upperCI
##    <chr>             <chr>          <lgl> <int> <int>  <dbl>     <dbl>     <dbl>
##  1 Australia and Ne… Australia      FALSE     9    12 0.75     0.505      0.995 
##  2 Australia and Ne… Australia      TRUE      3    12 0.25     0.00500    0.495 
##  3 Australia and Ne… New Zealand    FALSE    44    45 0.978    0.935      1     
##  4 Australia and Ne… New Zealand    TRUE      1    45 0.0222   0          0.0653
##  5 Caribbean         Dominican Rep… FALSE     6     6 1        1          1     
##  6 Caribbean         Haiti          FALSE    12    12 1        1          1     
##  7 Caribbean         Jamaica        FALSE     1     1 1        1          1     
##  8 Caribbean         Turks and Cai… FALSE     1     1 1        1          1     
##  9 Central America   El Salvador    FALSE    19    19 1        1          1     
## 10 Central America   Guatemala      FALSE    22    22 1        1          1     
## # … with 97 more rows

genotype prevalence numbers for text

# Southern Asia
tgc_from2010_regionFreq_h58 %>% filter(Region=="Southern Asia") %>% filter(h58)
## # A tibble: 1 × 7
## # Groups:   Region [1]
##   Region        h58       x     N     p p_lowerCI p_upperCI
##   <chr>         <lgl> <int> <int> <dbl>     <dbl>     <dbl>
## 1 Southern Asia TRUE   4662  6623 0.704     0.693     0.715
tgc_from2010_countryFreq_h58 %>% filter(Region=="Southern Asia") %>% filter(h58)
## # A tibble: 6 × 8
## # Groups:   Region, Country_Origin [6]
##   Region        Country_Origin h58       x     N     p p_lowerCI p_upperCI
##   <chr>         <chr>          <lgl> <int> <int> <dbl>     <dbl>     <dbl>
## 1 Southern Asia Afghanistan    TRUE      5     5 1         1         1    
## 2 Southern Asia Bangladesh     TRUE    670  1591 0.421     0.397     0.445
## 3 Southern Asia India          TRUE   1655  2267 0.730     0.712     0.748
## 4 Southern Asia Nepal          TRUE    941  1275 0.738     0.714     0.762
## 5 Southern Asia Pakistan       TRUE   1390  1484 0.937     0.924     0.949
## 6 Southern Asia Sri Lanka      TRUE      1     1 1         1         1
tgc_from2010_countryFreq_h58L1 %>% filter(Region=="Southern Asia") %>% filter(h58L1)
## # A tibble: 5 × 8
## # Groups:   Region, Country_Origin [5]
##   Region        Country_Origin h58L1     x     N      p p_lowerCI p_upperCI
##   <chr>         <chr>          <lgl> <int> <int>  <dbl>     <dbl>     <dbl>
## 1 Southern Asia Afghanistan    TRUE      5     5 1         1         1     
## 2 Southern Asia Bangladesh     TRUE    546  1591 0.343     0.320     0.367 
## 3 Southern Asia India          TRUE    268  2267 0.118     0.105     0.132 
## 4 Southern Asia Nepal          TRUE     63  1275 0.0494    0.0375    0.0613
## 5 Southern Asia Pakistan       TRUE   1089  1484 0.734     0.711     0.756
tgc_from2010_countryFreq_h58L2 %>% filter(Region=="Southern Asia") %>% filter(h58L2)
## # A tibble: 4 × 8
## # Groups:   Region, Country_Origin [4]
##   Region        Country_Origin h58L2     x     N       p p_lowerCI p_upperCI
##   <chr>         <chr>          <lgl> <int> <int>   <dbl>     <dbl>     <dbl>
## 1 Southern Asia Bangladesh     TRUE      9  1591 0.00566   0.00197   0.00934
## 2 Southern Asia India          TRUE   1214  2267 0.536     0.515     0.556  
## 3 Southern Asia Nepal          TRUE    726  1275 0.569     0.542     0.597  
## 4 Southern Asia Pakistan       TRUE     47  1484 0.0317    0.0228    0.0406
genotypes_by_country %>% filter(Region=="Southern Asia") %>% filter(Genotype=="4.3.1") %>% filter(Year=="all")
## # A tibble: 5 × 9
## # Groups:   Region, Country_Origin [5]
##   Region        Country_Origin Genotype Year      x     N proportion p_lowerCI
##   <chr>         <chr>          <chr>    <chr> <int> <int>      <dbl>     <dbl>
## 1 Southern Asia Bangladesh     4.3.1    all       2  1591    0.00126    0     
## 2 Southern Asia India          4.3.1    all     168  2267    0.0741     0.0633
## 3 Southern Asia Nepal          4.3.1    all     152  1275    0.119      0.101 
## 4 Southern Asia Pakistan       4.3.1    all     254  1484    0.171      0.152 
## 5 Southern Asia Sri Lanka      4.3.1    all       1     1    1          1     
## # … with 1 more variable: p_upperCI <dbl>
genotypes_by_country %>% filter(Country_Origin=="Pakistan") %>% filter(Genotype=="4.3.1.1.P1")
## # A tibble: 6 × 9
## # Groups:   Region, Country_Origin [1]
##   Region        Country_Origin Genotype   Year      x     N proportion p_lowerCI
##   <chr>         <chr>          <chr>      <chr> <int> <int>      <dbl>     <dbl>
## 1 Southern Asia Pakistan       4.3.1.1.P1 2016      1    96     0.0104     0    
## 2 Southern Asia Pakistan       4.3.1.1.P1 2017     58   337     0.172      0.132
## 3 Southern Asia Pakistan       4.3.1.1.P1 2018    422   678     0.622      0.586
## 4 Southern Asia Pakistan       4.3.1.1.P1 2019     86   244     0.352      0.293
## 5 Southern Asia Pakistan       4.3.1.1.P1 2020     27    31     0.871      0.753
## 6 Southern Asia Pakistan       4.3.1.1.P1 all     594  1484     0.400      0.375
## # … with 1 more variable: p_upperCI <dbl>
tgc_from2010_countryFreq_allGeno %>% filter(Region=="Southern Asia") %>% filter(p>0.05) %>% filter(!startsWith(Final_genotype, "4.3.1"))
## # A tibble: 7 × 8
## # Groups:   Region, Country_Origin [3]
##   Region    Country_Origin Final_genotype     x     N      p p_lowerCI p_upperCI
##   <chr>     <chr>          <chr>          <int> <int>  <dbl>     <dbl>     <dbl>
## 1 Southern… Bangladesh     2.0.1            103  1591 0.0647    0.0526    0.0768
## 2 Southern… Bangladesh     2.3.3            274  1591 0.172     0.154     0.191 
## 3 Southern… Bangladesh     3.2.2            264  1591 0.166     0.148     0.184 
## 4 Southern… Bangladesh     3.3.2             93  1591 0.0585    0.0469    0.0700
## 5 Southern… India          2.5              190  2267 0.0838    0.0724    0.0952
## 6 Southern… India          3.3              150  2267 0.0662    0.0559    0.0764
## 7 Southern… Nepal          3.3.2            164  1275 0.129     0.110     0.147
# South east Asia
tgc_from2010_regionFreq_h58 %>% filter(Region=="South-eastern Asia") %>% filter(h58)
## # A tibble: 1 × 7
## # Groups:   Region [1]
##   Region             h58       x     N     p p_lowerCI p_upperCI
##   <chr>              <lgl> <int> <int> <dbl>     <dbl>     <dbl>
## 1 South-eastern Asia TRUE    276   584 0.473     0.432     0.513
tgc_from2010_regionFreq_h58L1 %>% filter(Region=="South-eastern Asia") %>% filter(h58L1)
## # A tibble: 1 × 7
## # Groups:   Region [1]
##   Region             h58L1     x     N     p p_lowerCI p_upperCI
##   <chr>              <lgl> <int> <int> <dbl>     <dbl>     <dbl>
## 1 South-eastern Asia TRUE    251   584 0.430     0.390     0.470
tgc_from2010_countryFreq_h58 %>% filter(Region=="South-eastern Asia") %>% filter(h58)
## # A tibble: 9 × 8
## # Groups:   Region, Country_Origin [9]
##   Region            Country_Origin h58       x     N       p p_lowerCI p_upperCI
##   <chr>             <chr>          <lgl> <int> <int>   <dbl>     <dbl>     <dbl>
## 1 South-eastern As… Cambodia       TRUE    216   221 0.977       0.958    0.997 
## 2 South-eastern As… Indonesia      TRUE      2    65 0.0308      0        0.0728
## 3 South-eastern As… Laos           TRUE      1    27 0.0370      0        0.108 
## 4 South-eastern As… Malaysia       TRUE      1     3 0.333       0        0.867 
## 5 South-eastern As… Myanmar        TRUE     46    49 0.939       0.872    1     
## 6 South-eastern As… Philippines    TRUE      1   206 0.00485     0        0.0143
## 7 South-eastern As… Singapore      TRUE      4     4 1           1        1     
## 8 South-eastern As… Thailand       TRUE      4     5 0.8         0.449    1     
## 9 South-eastern As… Viet Nam       TRUE      1     3 0.333       0        0.867
tgc_from2010_countryFreq_h58L1 %>% filter(Region=="South-eastern Asia") %>% filter(h58L1)
## # A tibble: 6 × 8
## # Groups:   Region, Country_Origin [6]
##   Region             Country_Origin h58L1     x     N      p p_lowerCI p_upperCI
##   <chr>              <chr>          <lgl> <int> <int>  <dbl>     <dbl>     <dbl>
## 1 South-eastern Asia Cambodia       TRUE    216   221 0.977      0.958     0.997
## 2 South-eastern Asia Laos           TRUE      1    27 0.0370     0         0.108
## 3 South-eastern Asia Myanmar        TRUE     29    49 0.592      0.454     0.729
## 4 South-eastern Asia Singapore      TRUE      1     4 0.25       0         0.674
## 5 South-eastern Asia Thailand       TRUE      3     5 0.6        0.171     1    
## 6 South-eastern Asia Viet Nam       TRUE      1     3 0.333      0         0.867
tgc_from2010_countryFreq_allGeno %>% filter(Region=="South-eastern Asia") %>% filter(p>0.05) %>% filter(!startsWith(Final_genotype, "4.3.1"))
## # A tibble: 18 × 8
## # Groups:   Region, Country_Origin [7]
##    Region   Country_Origin Final_genotype     x     N      p p_lowerCI p_upperCI
##    <chr>    <chr>          <chr>          <int> <int>  <dbl>     <dbl>     <dbl>
##  1 South-e… Indonesia      2.1               10    65 0.154     0.0661     0.242
##  2 South-e… Indonesia      3                 12    65 0.185     0.0903     0.279
##  3 South-e… Indonesia      3.1.2              8    65 0.123     0.0432     0.203
##  4 South-e… Indonesia      4.1               17    65 0.262     0.155      0.368
##  5 South-e… Laos           2.3.4              3    27 0.111     0          0.230
##  6 South-e… Laos           3.2.1              3    27 0.111     0          0.230
##  7 South-e… Laos           3.4               12    27 0.444     0.257      0.632
##  8 South-e… Laos           3.5.2              4    27 0.148     0.0141     0.282
##  9 South-e… Laos           4.1                2    27 0.0741    0          0.173
## 10 South-e… Malaysia       2.2.1              1     3 0.333     0          0.867
## 11 South-e… Malaysia       3                  1     3 0.333     0          0.867
## 12 South-e… Philippines    3                163   206 0.791     0.736      0.847
## 13 South-e… Philippines    3.2.1             23   206 0.112     0.0686     0.155
## 14 South-e… Philippines    4.1               16   206 0.0777    0.0411     0.114
## 15 South-e… Thailand       4.1                1     5 0.2       0          0.551
## 16 South-e… Timor-Leste    2.1                1     1 1         1          1    
## 17 South-e… Viet Nam       2.1                1     3 0.333     0          0.867
## 18 South-e… Viet Nam       3.2.1              1     3 0.333     0          0.867
# Western Asia
genotypes_by_region %>% filter(Region=="Western Asia") %>% filter(Year=="all")
## # A tibble: 8 × 8
## # Groups:   Region [1]
##   Region       Genotype Year      x     N proportion p_lowerCI p_upperCI
##   <chr>        <chr>    <chr> <int> <int>      <dbl>     <dbl>     <dbl>
## 1 Western Asia 0.1      all       1    21     0.0476    0          0.139
## 2 Western Asia 2        all       1    21     0.0476    0          0.139
## 3 Western Asia 2.2.1    all       1    21     0.0476    0          0.139
## 4 Western Asia 2.2.2    all       2    21     0.0952    0          0.221
## 5 Western Asia 3.3.1    all       1    21     0.0476    0          0.139
## 6 Western Asia 4.3.1    all       3    21     0.143     0          0.293
## 7 Western Asia 4.3.1.1  all       8    21     0.381     0.173      0.589
## 8 Western Asia 4.3.1.2  all       4    21     0.190     0.0225     0.358
genotypes_by_country %>% filter(Region=="Western Asia") %>% filter(Year=="all")
## # A tibble: 14 × 9
## # Groups:   Region, Country_Origin [6]
##    Region       Country_Origin   Genotype Year      x     N proportion p_lowerCI
##    <chr>        <chr>            <chr>    <chr> <int> <int>      <dbl>     <dbl>
##  1 Western Asia Iraq             0.1      all       1     8      0.125    0     
##  2 Western Asia Iraq             2        all       1     8      0.125    0     
##  3 Western Asia Iraq             3.3.1    all       1     8      0.125    0     
##  4 Western Asia Iraq             4.3.1.1  all       2     8      0.25     0     
##  5 Western Asia Iraq             4.3.1.2  all       3     8      0.375    0.0395
##  6 Western Asia Lebanon          2.2.2    all       2     4      0.5      0.0100
##  7 Western Asia Lebanon          4.3.1.1  all       1     4      0.25     0     
##  8 Western Asia Lebanon          4.3.1.2  all       1     4      0.25     0     
##  9 Western Asia Qatar            4.3.1.1  all       1     1      1        1     
## 10 Western Asia Saudi Arabia     4.3.1    all       2     3      0.667    0.133 
## 11 Western Asia Saudi Arabia     4.3.1.1  all       1     3      0.333    0     
## 12 Western Asia Syria            2.2.1    all       1     1      1        1     
## 13 Western Asia United Arab Emi… 4.3.1    all       1     4      0.25     0     
## 14 Western Asia United Arab Emi… 4.3.1.1  all       3     4      0.75     0.326 
## # … with 1 more variable: p_upperCI <dbl>
tgc_from2010_regionFreq_h58 %>% filter(Region=="Western Asia") %>% filter(h58)
## # A tibble: 1 × 7
## # Groups:   Region [1]
##   Region       h58       x     N     p p_lowerCI p_upperCI
##   <chr>        <lgl> <int> <int> <dbl>     <dbl>     <dbl>
## 1 Western Asia TRUE     15    21 0.714     0.521     0.908
# Eastern Africa
tgc_from2010_regionFreq_h58 %>% filter(Region=="Eastern Africa") %>% filter(h58)
## # A tibble: 1 × 7
## # Groups:   Region [1]
##   Region         h58       x     N     p p_lowerCI p_upperCI
##   <chr>          <lgl> <int> <int> <dbl>     <dbl>     <dbl>
## 1 Eastern Africa TRUE    774   830 0.933     0.915     0.950
genotypes_by_region %>% filter(Region=="Eastern Africa") %>% filter(Year=="all")
## # A tibble: 17 × 8
## # Groups:   Region [1]
##    Region         Genotype    Year      x     N proportion p_lowerCI p_upperCI
##    <chr>          <chr>       <chr> <int> <int>      <dbl>     <dbl>     <dbl>
##  1 Eastern Africa 1.2         all       1   830    0.00120  0          0.00356
##  2 Eastern Africa 2.2         all       6   830    0.00723  0.00147    0.0130 
##  3 Eastern Africa 2.2.2       all       3   830    0.00361  0          0.00770
##  4 Eastern Africa 2.4.1       all       9   830    0.0108   0.00380    0.0179 
##  5 Eastern Africa 2.5         all       4   830    0.00482  0.000108   0.00953
##  6 Eastern Africa 2.5.1       all       4   830    0.00482  0.000108   0.00953
##  7 Eastern Africa 2.5.2       all       4   830    0.00482  0.000108   0.00953
##  8 Eastern Africa 3           all       2   830    0.00241  0          0.00575
##  9 Eastern Africa 3.1         all       2   830    0.00241  0          0.00575
## 10 Eastern Africa 3.3.1       all       7   830    0.00843  0.00221    0.0147 
## 11 Eastern Africa 4.1         all       1   830    0.00120  0          0.00356
## 12 Eastern Africa 4.1.1       all      13   830    0.0157   0.00722    0.0241 
## 13 Eastern Africa 4.3.1.1     all       4   830    0.00482  0.000108   0.00953
## 14 Eastern Africa 4.3.1.1.EA1 all     647   830    0.780    0.751      0.808  
## 15 Eastern Africa 4.3.1.2     all       2   830    0.00241  0          0.00575
## 16 Eastern Africa 4.3.1.2.EA2 all      46   830    0.0554   0.0399     0.0710 
## 17 Eastern Africa 4.3.1.2.EA3 all      75   830    0.0904   0.0709     0.110
genotypes_by_country %>% filter(Region=="Eastern Africa") %>% filter(Year=="all") %>% filter(Genotype=="4.3.1.1.EA1")
## # A tibble: 6 × 9
## # Groups:   Region, Country_Origin [6]
##   Region         Country_Origin Genotype  Year      x     N proportion p_lowerCI
##   <chr>          <chr>          <chr>     <chr> <int> <int>      <dbl>     <dbl>
## 1 Eastern Africa Kenya          4.3.1.1.… all      86   149     0.577      0.498
## 2 Eastern Africa Malawi         4.3.1.1.… all     524   558     0.939      0.919
## 3 Eastern Africa Mozambique     4.3.1.1.… all       1     1     1          1    
## 4 Eastern Africa Tanzania       4.3.1.1.… all      15    18     0.833      0.661
## 5 Eastern Africa Uganda         4.3.1.1.… all       1    36     0.0278     0    
## 6 Eastern Africa Zimbabwe       4.3.1.1.… all      20    25     0.8        0.643
## # … with 1 more variable: p_upperCI <dbl>
genotypes_by_country %>% filter(Region=="Eastern Africa") %>% filter(Year=="all") %>% filter(Genotype=="4.3.1.2.EA3")
## # A tibble: 4 × 9
## # Groups:   Region, Country_Origin [4]
##   Region         Country_Origin Genotype  Year      x     N proportion p_lowerCI
##   <chr>          <chr>          <chr>     <chr> <int> <int>      <dbl>     <dbl>
## 1 Eastern Africa Burundi        4.3.1.2.… all       2     2      1        1     
## 2 Eastern Africa Kenya          4.3.1.2.… all      15   149      0.101    0.0524
## 3 Eastern Africa Rwanda         4.3.1.2.… all      23    27      0.852    0.718 
## 4 Eastern Africa Uganda         4.3.1.2.… all      35    36      0.972    0.919 
## # … with 1 more variable: p_upperCI <dbl>
genotypes_by_country %>% filter(Region=="Eastern Africa") %>% filter(Country_Origin=="Kenya") %>% filter(Year %in% c(2012, 2013, 2014, 2015, 2016)) %>% filter(Genotype=="4.3.1.1.EA1") %>% summarise(n=sum(x), N=sum(N), p=n/N)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## # A tibble: 1 × 5
## # Groups:   Region [1]
##   Region         Country_Origin     n     N     p
##   <chr>          <chr>          <int> <int> <dbl>
## 1 Eastern Africa Kenya             86   145 0.593
genotypes_by_country %>% filter(Region=="Eastern Africa") %>% filter(Country_Origin=="Kenya") %>% filter(Year %in% c(2017, 2018, 2019)) %>% filter(Genotype=="4.3.1.2.EA3") %>% summarise(n=sum(x), N=sum(N), p=n/N)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## # A tibble: 1 × 5
## # Groups:   Region [1]
##   Region         Country_Origin     n     N     p
##   <chr>          <chr>          <int> <int> <dbl>
## 1 Eastern Africa Kenya              4     4     1
# Southern Africa
tgc_from2010 %>% filter(Region=="Southern Africa") %>% group_by(Country_Origin) %>% summarise(n=n())
## # A tibble: 2 × 2
##   Country_Origin     n
##   <chr>          <int>
## 1 Namibia            1
## 2 South Africa     285
tgc_from2010 %>% filter(Region=="Southern Africa") %>% group_by(Country_Origin, Year) %>% summarise(n=n())
## `summarise()` has grouped output by 'Country_Origin'. You can override using
## the `.groups` argument.
## # A tibble: 9 × 3
## # Groups:   Country_Origin [2]
##   Country_Origin Year      n
##   <chr>          <chr> <int>
## 1 Namibia        2017      1
## 2 South Africa   2010      5
## 3 South Africa   2011      3
## 4 South Africa   2012      8
## 5 South Africa   2016      7
## 6 South Africa   2017     76
## 7 South Africa   2018     61
## 8 South Africa   2019     81
## 9 South Africa   2020     44
tgc_from2010 %>% filter(Country_Origin=="South Africa") %>% filter(Year %in% c(2017, 2018, 2019, 2020)) %>% summarise(n=n())
## # A tibble: 1 × 1
##       n
##   <int>
## 1   262
tgc_from2010 %>% filter(Country_Origin=="South Africa") %>% filter(Year %in% c(2017, 2018, 2019, 2020)) %>% mutate(h58=startsWith(Final_genotype,"4.3.1")) %>% group_by(h58) %>% summarise(n=n()) %>% mutate(p=n/sum(n), N=sum(n))
## # A tibble: 2 × 4
##   h58       n     p     N
##   <lgl> <int> <dbl> <int>
## 1 FALSE    80 0.305   262
## 2 TRUE    182 0.695   262
tgc_from2010 %>% filter(Country_Origin=="South Africa") %>% filter(Year %in% c(2010, 2011, 2012)) %>% mutate(h58=startsWith(Final_genotype,"4.3.1")) %>% group_by(h58) %>% summarise(n=n()) %>% mutate(p=n/sum(n), N=sum(n))
## # A tibble: 2 × 4
##   h58       n     p     N
##   <lgl> <int> <dbl> <int>
## 1 FALSE    12  0.75    16
## 2 TRUE      4  0.25    16
tgc_from2010 %>% filter(Country_Origin=="South Africa") %>% filter(Year %in% c(2017, 2018, 2019, 2020)) %>% group_by(Final_genotype) %>% summarise(n=n()) %>% mutate(p=n/sum(n), N=sum(n))
## # A tibble: 19 × 4
##    Final_genotype     n       p     N
##    <chr>          <int>   <dbl> <int>
##  1 1.1.2              3 0.0115    262
##  2 2.1                1 0.00382   262
##  3 2.2                4 0.0153    262
##  4 2.3.2              1 0.00382   262
##  5 2.4               20 0.0763    262
##  6 2.4.1             14 0.0534    262
##  7 2.5               11 0.0420    262
##  8 2.5.1              2 0.00763   262
##  9 3                  1 0.00382   262
## 10 3.1                2 0.00763   262
## 11 3.1.1              1 0.00382   262
## 12 3.2.1              1 0.00382   262
## 13 3.3                1 0.00382   262
## 14 3.3.1             18 0.0687    262
## 15 4.3.1.1            9 0.0344    262
## 16 4.3.1.1.EA1      168 0.641     262
## 17 4.3.1.2            1 0.00382   262
## 18 4.3.1.2.1          3 0.0115    262
## 19 4.3.1.2.EA3        1 0.00382   262
# Western Africa
genotypes_by_region %>% filter(Region=="Western Africa") %>% filter(Year=="all")
## # A tibble: 11 × 8
## # Groups:   Region [1]
##    Region         Genotype Year      x     N proportion p_lowerCI p_upperCI
##    <chr>          <chr>    <chr> <int> <int>      <dbl>     <dbl>     <dbl>
##  1 Western Africa 0.0.1    all       2   267    0.00749   0          0.0178
##  2 Western Africa 0.0.3    all       7   267    0.0262    0.00705    0.0454
##  3 Western Africa 2.1      all       2   267    0.00749   0          0.0178
##  4 Western Africa 2.2      all      17   267    0.0637    0.0344     0.0930
##  5 Western Africa 2.3.1    all      15   267    0.0562    0.0286     0.0838
##  6 Western Africa 2.3.2    all      37   267    0.139     0.0971     0.180 
##  7 Western Africa 3.1      all       1   267    0.00375   0          0.0111
##  8 Western Africa 3.1.1    all     172   267    0.644     0.587      0.702 
##  9 Western Africa 3.3      all       1   267    0.00375   0          0.0111
## 10 Western Africa 4.1      all      10   267    0.0375    0.0147     0.0602
## 11 Western Africa 4.1.1    all       3   267    0.0112    0          0.0239
tgc_all %>% filter(Region=="Western Africa") %>% filter(Final_genotype=="3.1.1") %>% group_by(Year, Country_Origin) %>% summarise(n=n()) %>% arrange(Country_Origin, Year)
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
## # A tibble: 37 × 3
## # Groups:   Year [16]
##    Year  Country_Origin     n
##    <chr> <chr>          <int>
##  1 2002  Benin              1
##  2 2004  Benin              2
##  3 2009  Benin              1
##  4 2006  Burkina Faso       3
##  5 2012  Burkina Faso       2
##  6 2013  Burkina Faso       6
##  7 2006  Cote d'Ivoire      1
##  8 2007  Cote d'Ivoire      2
##  9 2008  Cote d'Ivoire      1
## 10 2015  Gambia             1
## # … with 27 more rows
tgc_all %>% filter(Region=="Western Africa") %>% filter(Final_genotype=="2.3.2") %>% group_by(Year, Country_Origin) %>% summarise(n=n()) %>% arrange(Country_Origin, Year)
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
## # A tibble: 26 × 3
## # Groups:   Year [16]
##    Year  Country_Origin     n
##    <chr> <chr>          <int>
##  1 2012  Burkina Faso       1
##  2 2013  Burkina Faso       1
##  3 2008  Gambia             3
##  4 2009  Gambia             1
##  5 2010  Gambia             1
##  6 2011  Gambia             3
##  7 2012  Gambia             7
##  8 2013  Gambia             5
##  9 2014  Gambia             3
## 10 2010  Ghana              2
## # … with 16 more rows
# Middle Africa
genotypes_by_region %>% filter(Region=="Middle Africa") %>% filter(Year=="all")
## # A tibble: 4 × 8
## # Groups:   Region [1]
##   Region        Genotype    Year      x     N proportion p_lowerCI p_upperCI
##   <chr>         <chr>       <chr> <int> <int>      <dbl>     <dbl>     <dbl>
## 1 Middle Africa 2.1         all       2    21     0.0952     0         0.221
## 2 Middle Africa 2.5.1       all      16    21     0.762      0.580     0.944
## 3 Middle Africa 4.1.1       all       2    21     0.0952     0         0.221
## 4 Middle Africa 4.3.1.2.EA3 all       1    21     0.0476     0         0.139
genotypes_by_country %>% filter(Region=="Middle Africa") %>% filter(Year=="all")
## # A tibble: 4 × 9
## # Groups:   Region, Country_Origin [3]
##   Region        Country_Origin   Genotype Year      x     N proportion p_lowerCI
##   <chr>         <chr>            <chr>    <chr> <int> <int>      <dbl>     <dbl>
## 1 Middle Africa Angola           4.1.1    all       2     2     1          1    
## 2 Middle Africa Chad             2.1      all       2     2     1          1    
## 3 Middle Africa Democratic Repu… 2.5.1    all      16    17     0.941      0.829
## 4 Middle Africa Democratic Repu… 4.3.1.2… all       1    17     0.0588     0    
## # … with 1 more variable: p_upperCI <dbl>
tgc_from2010 %>% filter(Region=="Middle Africa") %>% select(Final_genotype, Year, Country_Origin, Country, `Travel Associated`, `Isolating Lab`)
## # A tibble: 21 × 6
##    Final_genotype Year  Country_Origin  Country `Travel Associ…` `Isolating Lab`
##    <chr>          <chr> <chr>           <chr>   <chr>            <chr>          
##  1 2.1            2019  Chad            France  Yes              IPA            
##  2 2.1            2019  Chad            France  Yes              IPA            
##  3 2.5.1          2016  Democratic Rep… USA     Yes              CDC            
##  4 4.3.1.2.EA3    2018  Democratic Rep… United… Yes              PHE            
##  5 4.1.1          2015  Angola          United… Yes              PHE            
##  6 4.1.1          2015  Angola          United… Yes              PHE            
##  7 2.5.1          2010  Democratic Rep… Democr… No               INR            
##  8 2.5.1          2011  Democratic Rep… Democr… No               INR            
##  9 2.5.1          2011  Democratic Rep… Democr… No               INR            
## 10 2.5.1          2011  Democratic Rep… Democr… No               INR            
## # … with 11 more rows
# Northern Africa
tgc_from2010 %>% filter(Region=="Northern Africa") %>% select(Final_genotype, Year, Country_Origin, Country, `Travel Associated`, `Isolating Lab`)
## # A tibble: 6 × 6
##   Final_genotype Year  Country_Origin Country   `Travel Associ…` `Isolating Lab`
##   <chr>          <chr> <chr>          <chr>     <chr>            <chr>          
## 1 1.1            2017  Morocco        USA       Yes              CDC            
## 2 3.3            2019  Tunisia        United K… Yes              PHE            
## 3 4              2018  Sudan          United K… Yes              PHE            
## 4 4              2018  Sudan          United K… Yes              PHE            
## 5 0.1            2017  Morocco        United K… Yes              PHE            
## 6 0.1            2015  Egypt          United K… Yes              PHE
# Central America
genotypes_by_region %>% filter(Region=="Central America") %>% filter(Year=="all")
## # A tibble: 7 × 8
## # Groups:   Region [1]
##   Region          Genotype   Year      x     N proportion p_lowerCI p_upperCI
##   <chr>           <chr>      <chr> <int> <int>      <dbl>     <dbl>     <dbl>
## 1 Central America 2          all       1   100       0.01    0         0.0295
## 2 Central America 2.0.2      all      24   100       0.24    0.156     0.324 
## 3 Central America 2.3.2      all      55   100       0.55    0.452     0.648 
## 4 Central America 4.1        all      17   100       0.17    0.0964    0.244 
## 5 Central America 4.3.1.1    all       1   100       0.01    0         0.0295
## 6 Central America 4.3.1.1.P1 all       1   100       0.01    0         0.0295
## 7 Central America 4.3.1.2.1  all       1   100       0.01    0         0.0295
genotypes_by_country %>% filter(Region=="Central America") %>% filter(Year=="all")
## # A tibble: 12 × 9
## # Groups:   Region, Country_Origin [4]
##    Region         Country_Origin Genotype Year      x     N proportion p_lowerCI
##    <chr>          <chr>          <chr>    <chr> <int> <int>      <dbl>     <dbl>
##  1 Central Ameri… El Salvador    2.0.2    all       2    19     0.105     0     
##  2 Central Ameri… El Salvador    2.3.2    all      17    19     0.895     0.757 
##  3 Central Ameri… Guatemala      2        all       1    22     0.0455    0     
##  4 Central Ameri… Guatemala      2.0.2    all       7    22     0.318     0.124 
##  5 Central Ameri… Guatemala      2.3.2    all       9    22     0.409     0.204 
##  6 Central Ameri… Guatemala      4.1      all       5    22     0.227     0.0522
##  7 Central Ameri… Mexico         2.0.2    all      15    58     0.259     0.146 
##  8 Central Ameri… Mexico         2.3.2    all      29    58     0.5       0.371 
##  9 Central Ameri… Mexico         4.1      all      12    58     0.207     0.103 
## 10 Central Ameri… Mexico         4.3.1.1… all       1    58     0.0172    0     
## 11 Central Ameri… Mexico         4.3.1.2… all       1    58     0.0172    0     
## 12 Central Ameri… Panama         4.3.1.1  all       1     1     1         1     
## # … with 1 more variable: p_upperCI <dbl>
tgc_all %>% filter(Region=="Central America") %>% group_by(`Isolating Lab`) %>% summarise(n=n())
## # A tibble: 5 × 2
##   `Isolating Lab`     n
##   <chr>           <int>
## 1 CDC                89
## 2 ESR                 1
## 3 IPA                 3
## 4 MDU                 2
## 5 PHE                 8
tgc_from2010 %>% filter(Region=="Central America") %>% group_by(Country_Origin) %>% summarise(n=n())
## # A tibble: 4 × 2
##   Country_Origin     n
##   <chr>          <int>
## 1 El Salvador       19
## 2 Guatemala         22
## 3 Mexico            58
## 4 Panama             1
tgc_from2010 %>% filter(Region=="Central America") %>% group_by(Country_Origin, Year) %>% summarise(n=n())
## `summarise()` has grouped output by 'Country_Origin'. You can override using
## the `.groups` argument.
## # A tibble: 15 × 3
## # Groups:   Country_Origin [4]
##    Country_Origin Year      n
##    <chr>          <chr> <int>
##  1 El Salvador    2012      1
##  2 El Salvador    2016      3
##  3 El Salvador    2017      2
##  4 El Salvador    2018      2
##  5 El Salvador    2019     11
##  6 Guatemala      2016      5
##  7 Guatemala      2017      3
##  8 Guatemala      2018      3
##  9 Guatemala      2019     11
## 10 Mexico         2011      1
## 11 Mexico         2016      6
## 12 Mexico         2017     13
## 13 Mexico         2018      8
## 14 Mexico         2019     30
## 15 Panama         2016      1
tgc_from2010 %>% filter(Region=="Central America") %>% group_by(Country_Origin, Final_genotype) %>% summarise(n=n()) %>% mutate(p=n/sum(n))
## `summarise()` has grouped output by 'Country_Origin'. You can override using
## the `.groups` argument.
## # A tibble: 12 × 4
## # Groups:   Country_Origin [4]
##    Country_Origin Final_genotype     n      p
##    <chr>          <chr>          <int>  <dbl>
##  1 El Salvador    2.0.2              2 0.105 
##  2 El Salvador    2.3.2             17 0.895 
##  3 Guatemala      2                  1 0.0455
##  4 Guatemala      2.0.2              7 0.318 
##  5 Guatemala      2.3.2              9 0.409 
##  6 Guatemala      4.1                5 0.227 
##  7 Mexico         2.0.2             15 0.259 
##  8 Mexico         2.3.2             29 0.5   
##  9 Mexico         4.1               12 0.207 
## 10 Mexico         4.3.1.1.P1         1 0.0172
## 11 Mexico         4.3.1.2.1          1 0.0172
## 12 Panama         4.3.1.1            1 1
tgc_all %>% filter(Region=="Central America") %>% filter(Year <2010) %>% select(Final_genotype, `Isolating Lab`, Country_Origin, Year)
## # A tibble: 3 × 4
##   Final_genotype `Isolating Lab` Country_Origin Year 
##   <chr>          <chr>           <chr>          <chr>
## 1 2.3.2          IPA             Mexico         1972 
## 2 2.3.2          IPA             Mexico         1998 
## 3 4.1            IPA             Mexico         1998
# Southern America
genotypes_by_region %>% filter(Region=="South America") %>% filter(Year=="all")
## # A tibble: 14 × 8
## # Groups:   Region [1]
##    Region        Genotype  Year      x     N proportion p_lowerCI p_upperCI
##    <chr>         <chr>     <chr> <int> <int>      <dbl>     <dbl>     <dbl>
##  1 South America 1.1       all      19   105    0.181     0.107      0.255 
##  2 South America 1.2.1     all       6   105    0.0571    0.0127     0.102 
##  3 South America 2         all      19   105    0.181     0.107      0.255 
##  4 South America 2.0.2     all       6   105    0.0571    0.0127     0.102 
##  5 South America 2.2       all       3   105    0.0286    0          0.0604
##  6 South America 2.3.2     all       1   105    0.00952   0          0.0281
##  7 South America 2.3.3     all       5   105    0.0476    0.00689    0.0884
##  8 South America 2.5       all       4   105    0.0381    0.00148    0.0747
##  9 South America 3         all       1   105    0.00952   0          0.0281
## 10 South America 3.3       all       1   105    0.00952   0          0.0281
## 11 South America 3.5       all      28   105    0.267     0.182      0.351 
## 12 South America 4.1       all       5   105    0.0476    0.00689    0.0884
## 13 South America 4.3.1.1   all       2   105    0.0190    0          0.0452
## 14 South America 4.3.1.2.1 all       5   105    0.0476    0.00689    0.0884
tgc_from2010 %>% filter(Region=="South America") %>% group_by(Country_Origin) %>% summarise(n=n()) %>% mutate(p=n/sum(n))
## # A tibble: 5 × 3
##   Country_Origin     n       p
##   <chr>          <int>   <dbl>
## 1 Brazil             1 0.00952
## 2 Chile             97 0.924  
## 3 French Guiana      3 0.0286 
## 4 Peru               3 0.0286 
## 5 Suriname           1 0.00952
genotypes_by_region %>% filter(Region=="South America") %>% filter(Year=="all") %>% filter(proportion>0.05) %>% arrange(proportion)
## # A tibble: 5 × 8
## # Groups:   Region [1]
##   Region        Genotype Year      x     N proportion p_lowerCI p_upperCI
##   <chr>         <chr>    <chr> <int> <int>      <dbl>     <dbl>     <dbl>
## 1 South America 1.2.1    all       6   105     0.0571    0.0127     0.102
## 2 South America 2.0.2    all       6   105     0.0571    0.0127     0.102
## 3 South America 1.1      all      19   105     0.181     0.107      0.255
## 4 South America 2        all      19   105     0.181     0.107      0.255
## 5 South America 3.5      all      28   105     0.267     0.182      0.351
tgc_all %>% filter(Country_Origin=="Colombia") %>% select(Final_genotype, Year) %>% group_by(Final_genotype, Year) %>% summarise(n=n()) %>% mutate(p=n/sum(n))
## `summarise()` has grouped output by 'Final_genotype'. You can override using
## the `.groups` argument.
## # A tibble: 30 × 4
## # Groups:   Final_genotype [4]
##    Final_genotype Year      n      p
##    <chr>          <chr> <int>  <dbl>
##  1 1.1            2004      1 1     
##  2 2              1997      1 0.2   
##  3 2              2000      1 0.2   
##  4 2              2003      1 0.2   
##  5 2              2004      1 0.2   
##  6 2              2018      1 0.2   
##  7 2.5            1999      4 0.0784
##  8 2.5            2000      4 0.0784
##  9 2.5            2005      4 0.0784
## 10 2.5            2006      1 0.0196
## # … with 20 more rows
tgc_all %>% filter(Country_Origin=="French Guiana") %>% select(Final_genotype, Year, `Isolating Lab`)
## # A tibble: 5 × 3
##   Final_genotype Year  `Isolating Lab`
##   <chr>          <chr> <chr>          
## 1 2.5            2002  IPA            
## 2 2.5            2002  IPA            
## 3 2.5            2019  IPA            
## 4 2.5            2018  IPA            
## 5 2.5            2016  IPA
tgc_all %>% filter(Final_genotype=="2.3.2") %>% group_by(Region, Country_Origin) %>% summarise(n=n())
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## # A tibble: 32 × 3
## # Groups:   Region [15]
##    Region           Country_Origin               n
##    <chr>            <chr>                    <int>
##  1 Caribbean        Dominican Republic           1
##  2 Caribbean        Haiti                        4
##  3 Caribbean        Turks and Caicos Islands     1
##  4 Central America  El Salvador                 17
##  5 Central America  Guatemala                    9
##  6 Central America  Mexico                      31
##  7 Middle Africa    Angola                       1
##  8 Northern America USA                         21
##  9 Northern Europe  United Kingdom               2
## 10 Polynesia        Samoa                        1
## # … with 22 more rows
# Pacific Islands
genotypes_by_country %>% filter(Country_Origin=="Papua New Guinea") %>% filter(Year=="all")
## # A tibble: 1 × 9
## # Groups:   Region, Country_Origin [1]
##   Region    Country_Origin   Genotype Year      x     N proportion p_lowerCI
##   <chr>     <chr>            <chr>    <chr> <int> <int>      <dbl>     <dbl>
## 1 Melanesia Papua New Guinea 2.1.7.1  all       5     5          1         1
## # … with 1 more variable: p_upperCI <dbl>
genotypes_by_country %>% filter(Country_Origin=="Samoa") %>% filter(Year=="all")
## # A tibble: 7 × 9
## # Groups:   Region, Country_Origin [1]
##   Region    Country_Origin Genotype Year      x     N proportion p_lowerCI
##   <chr>     <chr>          <chr>    <chr> <int> <int>      <dbl>     <dbl>
## 1 Polynesia Samoa          2.2.1    all       1   259    0.00386    0     
## 2 Polynesia Samoa          3.5.3    all       3   259    0.0116     0     
## 3 Polynesia Samoa          3.5.4    all       2   259    0.00772    0     
## 4 Polynesia Samoa          3.5.4.1  all      80   259    0.309      0.253 
## 5 Polynesia Samoa          3.5.4.2  all      72   259    0.278      0.223 
## 6 Polynesia Samoa          3.5.4.3  all      92   259    0.355      0.297 
## 7 Polynesia Samoa          4.1      all       9   259    0.0347     0.0124
## # … with 1 more variable: p_upperCI <dbl>
genotypes_by_country %>% filter(Country_Origin=="Fiji") %>% filter(Year=="all")
## # A tibble: 3 × 9
## # Groups:   Region, Country_Origin [1]
##   Region    Country_Origin Genotype Year      x     N proportion p_lowerCI
##   <chr>     <chr>          <chr>    <chr> <int> <int>      <dbl>     <dbl>
## 1 Melanesia Fiji           3.3.1    all       1    32     0.0312     0    
## 2 Melanesia Fiji           4.2.1    all       1    32     0.0312     0    
## 3 Melanesia Fiji           4.2.2    all      30    32     0.938      0.854
## # … with 1 more variable: p_upperCI <dbl>

Identify genotypes to include in map (ie with ≥20% freq in any country, 2010-2020)

# genotype freqs per region, from 2010
region_N <- tgc_from2010 %>%
  group_by(Region)%>% 
  summarize(region_total =n())

# genotype freqs per country, from 2010
country_N <- tgc_from2010 %>%
  group_by(Country_Origin)%>% 
  summarize(country_total =n())

country_geno <- tgc_from2010 %>%
  group_by(Country_Origin, Final_genotype)%>% 
  summarize(n =n()) %>%
  left_join(country_N) %>% # country counts
  mutate(geno_freq_country = round(n/country_total*100,0)) # percent per region
## `summarise()` has grouped output by 'Country_Origin'. You can override using
## the `.groups` argument.
## Joining, by = "Country_Origin"
# genotypes with ≥20% freq in any country; note this includes all H58 subtypes already
geno_country20 <- names(table(country_geno$Final_genotype[country_geno$geno_freq_country>=20]))


# set genotype colours
geno_info <- read_csv("https://raw.githubusercontent.com/katholt/genotyphi/main/Genotype_specification.csv") %>% filter(Genotype %in% geno_country20)
## Rows: 82 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Genotype, H58 Genotype, Gene in CT18 reference sequence, Product, ...
## dbl  (1): Position in CT18 reference sequence
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
geno_colours <- c(geno_info$`Colour code`,"grey")
names(geno_colours) <- c(geno_info$Genotype, "Other")
geno_colours <- geno_colours[order(names(geno_colours))] # sort for legend


# distinguish H58 subtypes
geno_colours2 <- geno_colours
geno_colours2["4.3.1.1.EA1"] = "#c27ba0"
geno_colours2["4.3.1.2.EA2"] = "#e250c4"
geno_colours2["4.3.1.2.EA3"] = "#94165f"
geno_colours2["4.3.1.2.1"] = "#980b5c"
geno_colours2["4.3.1.2.1.1"] = "#cc0000"

# colour new Samoa subtypes
geno_colours2["3.5.4.1"] = "#4b0082"
geno_colours2["3.5.4.2"] = "#7b2ab7"
geno_colours2["3.5.4.3"] = "#aa73d3"

# order by name
geno_colours2 <- geno_colours2[sort(names(geno_colours2))]

read in GPS data

# read gps coordinates
gps <- read.csv("../UN_region_coords.csv")%>%
  select(Region, latitude, longitude)

Prepare data for maps with pie charts showing genotype freqs

# pie data, 2010 - 2020, genotype >20% prevalent in an individual country since 2010
tgc_from2010_toMap_countryGeno <-  tgc_from2010 %>%
  mutate(map_genotype = if_else(Final_genotype %in% geno_country20, Final_genotype, "Other"))%>%
  select(Region, Year, Final_genotype, map_genotype)%>%
  dcast(Region ~ map_genotype)%>%
  left_join(gps)%>%
  left_join(region_N)%>%
  mutate(lat = as.numeric(latitude), 
         long = as.numeric(longitude),
         mytext=paste(
            Region, "\n", 
           "N= ", region_total, sep="")
         )
## Using map_genotype as value column: use value.var to override.
## Aggregation function missing: defaulting to length
## Joining, by = "Region"
## Joining, by = "Region"

set up to colour map to indicate countries contributing data

typhi_countryN <- tgc_meta %>%
  filter(Year >= 2010) %>% # restrict to last 10 years
  group_by(Country_Origin)%>%
  summarize(CountryCount2010=n())

# match map object country names
typhi_countryN$Country_Origin <- replace(typhi_countryN$Country_Origin, typhi_countryN$Country_Origin=="United Kingdom", "UK")
typhi_countryN$Country_Origin <- replace(typhi_countryN$Country_Origin, typhi_countryN$Country_Origin=="Viet Nam", "Vietnam")

# add N per country
typhi_Nmap <- map_data('world') %>%
        group_by(region) %>%
  # Merge in Typhi counts
  left_join(typhi_countryN, by = c('region' = 'Country_Origin'))

typhi_Nmap$CountryData <- typhi_Nmap$CountryCount2010 > 0

Fig 1a: map with pies (39 genotypes) without legend

genotype_pie_map_countryGeno_nolegend <- ggplot(data=typhi_Nmap, aes( x = long, y = lat, group=group)) +
  geom_polygon(data=typhi_Nmap, aes(fill = CountryData), color="grey") + 
  scale_fill_manual(values=c("#f2e6d9"), na.value="white") + # pale brown
  new_scale_fill() +
    geom_scatterpie(data = tgc_from2010_toMap_countryGeno, aes(x= long, y=lat , group = Region, r = 9),
                  cols= c(geno_country20, "Other"), color=NA, alpha=1) +
  scale_fill_manual(values = geno_colours2)+
  labs(fill = "Genotype")+
  ylab("")+
  xlab("")+
  theme_minimal()+
  theme(axis.text = element_blank(),
        axis.line = element_blank(),
        axis.ticks = element_blank(), 
        panel.border = element_blank(), 
        panel.grid = element_blank(), 
        legend.title = element_text(face = "bold"), 
        legend.position="none") +
  ggtitle("a) Genotype prevalence by world region, 2010-2020")

genotype_pie_map_countryGeno_nolegend
## Warning: Removed 258 rows containing non-finite values (stat_pie).

Fig S2: facet plot genotypes per region, normalised (%)

#select for 2010 - 2020, H58 genotypes plus any genotype >20% prevalent in an individual country
country_year_data_39geno <- tgc_meta %>%
  mutate(map_genotype = if_else(Final_genotype %in% geno_country20, Final_genotype, "Other"))%>%
   filter(Year >=2010) 

region <- table(country_year_data_39geno$Region)
region <- names(region[region>=20])

region_facet_39geno <- country_year_data_39geno %>%
  filter(Year <=2020) %>%
  filter(Region %in% region) %>%
  ggplot(aes(fill=map_genotype, x=Year)) + 
  geom_bar(position="fill") + 
  scale_fill_manual(values = geno_colours2)+
  labs(fill = "Genotype")+
  ylab("")+
  xlab("")+
  facet_wrap(~Region , ncol=3) + 
  theme_bw() +
  theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=8, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"), legend.text = element_text(size=8), legend.key.size=unit(0.25, "cm"), legend.title = element_text(size=8, face="bold")) +
  guides(fill=guide_legend(ncol=1))

region_facet_39geno

pdf(file=paste0("SuppFig2_region_facet.pdf"),width=7,height=8)
region_facet_39geno
dev.off()
## quartz_off_screen 
##                 2
png(paste0("SuppFig2_region_facet.png"), width=7, height=8, units="in", res=400)
region_facet_39geno
dev.off()
## quartz_off_screen 
##                 2

Fig S3: facet plot genotypes per country, excluding key countries, normalised (%)

countryN <- table(tgc_from2010$Country_Origin)
country50 <- names(countryN[countryN>=50])

country_facet_39geno_exclKey <- country_year_data_39geno %>%
  filter(!(Country_Origin %in% country50)) %>%
  ggplot(aes(fill=map_genotype, x=Year)) + 
  geom_bar(position="fill") + 
  scale_fill_manual(values = geno_colours2)+
  labs(fill = "Genotype")+
  ylab("")+
  xlab("")+
  facet_wrap(~Region + Country_Origin, ncol=6) + 
  theme_bw() +
  theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=8, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"), legend.text = element_text(size=10), legend.key.size=unit(0.25, "cm"), legend.title = element_text(size=10, face="bold")) +
  guides(fill=guide_legend(ncol=1))

country_facet_39geno_exclKey

pdf(file=paste0("SuppFig3_country_facet_exclKey.pdf"),width=13,height=18)
country_facet_39geno_exclKey
dev.off()
## quartz_off_screen 
##                 2
png(file=paste0("SuppFig3_country_facet_exclKey.png"),width=13,height=18, units="in", res=400)
country_facet_39geno_exclKey
dev.off()
## quartz_off_screen 
##                 2

Fig 1b:facet plot genotypes per country, normalised (%) - key countries

# create country labels with counts
country_label <- paste0(names(countryN), " (N=", countryN,")")
names(country_label) <- names(countryN)

#select for 2010 - 2020, H58 genotypes plus any genotype >20% prevalent in an individual country
country_facet_39geno_keycountries <- tgc_meta %>%
  mutate(map_genotype = if_else(Final_genotype %in% geno_country20, Final_genotype, "Other"))%>%
  select(Year, map_genotype, Region, Country_Origin)%>%
  filter(Year >=2010 & Year <=2020) %>%
  filter(Country_Origin %in% country50) %>%
  filter(!(Country_Origin %in% c("United Kingdom","USA"))) %>%
  mutate(country_label = country_label[Country_Origin]) %>%
  ggplot(aes(fill=map_genotype, x=Year)) + 
  geom_bar(position="fill") + 
  scale_fill_manual(values = geno_colours2)+
  labs(fill = "Genotype")+
  ylab("")+
  xlab("")+
  facet_wrap(~Region + country_label, ncol=3) + 
  theme_bw() +
  theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=8, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"), legend.text = element_text(size=8), legend.key.size=unit(0.25, "cm"), legend.title = element_text(size=8, face="bold")) +
  guides(fill=guide_legend(ncol=1))+
  ggtitle("b) Annual genotype prevalence, for countries with at least 50 genomes")

country_facet_39geno_keycountries

Fig 1: regional genotype map (a) plus national annual prevalences (b)

pdf(file=paste0("Fig1_map_genotypesMainCountries_facet.pdf"),width=7,height=9)
  genotype_pie_map_countryGeno_nolegend / (country_facet_39geno_keycountries +  guide_area() + plot_layout(widths=c(5,1))) + plot_layout(nrow=2, heights=c(2,3))
## Warning: Removed 258 rows containing non-finite values (stat_pie).
dev.off()
## quartz_off_screen 
##                 2
png(file=paste0("Fig1_map_genotypesMainCountries_facet.png"),width=7,height=9, units="in", res=400)
  genotype_pie_map_countryGeno_nolegend / (country_facet_39geno_keycountries +  guide_area() + plot_layout(widths=c(5,1))) + plot_layout(nrow=2, heights=c(2,3))
## Warning: Removed 258 rows containing non-finite values (stat_pie).
dev.off()
## quartz_off_screen 
##                 2

Section 3 - AMR

Fig S7a: facet plot genotypes per country, key countries, MDR

mdr_genotypes_key_countries <- tgc_meta %>%
  select(Year, Final_genotype, Region, Country_Origin, MDR)%>%
  filter(Year >=2010 & Year <=2020) %>%
  filter(Country_Origin %in% country50) %>%
  filter(!(Country_Origin %in% c("United Kingdom","USA"))) %>%
  mutate(MDR_geno = ifelse(MDR, Final_genotype, "Non-MDR"))

geno_colours_mdr <- geno_colours[names(table(mdr_genotypes_key_countries$MDR_geno))]
geno_colours_mdr["4.3.1.3"] <- geno_colours["4.3.1.3.Bdq"]
geno_colours_mdr["Non-MDR"] <- "lightgrey"
geno_colours_mdr <- na.omit(geno_colours_mdr)

mdr_genotypes_key_countries$MDR_geno <- factor(mdr_genotypes_key_countries$MDR_geno, levels=names(table(mdr_genotypes_key_countries$MDR_geno))[c(12,1:11)], ordered=T)

country_facet_39geno_keycountries_MDR <- mdr_genotypes_key_countries %>%
  ggplot(aes(fill=MDR_geno, x=Year)) + 
  geom_bar(position="fill") + 
  scale_fill_manual(values = geno_colours_mdr)+
  labs(fill = "Genotype")+
  ylab("")+
  xlab("")+
  facet_wrap(~Region + Country_Origin, ncol=3) + 
  theme_bw() +
  theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=10, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 8, colour = "black"), legend.text = element_text(size=8), legend.key.size=unit(0.25, "cm"), legend.title = element_text(size=8, face="bold")) +
  guides(fill=guide_legend(ncol=1))+
  ggtitle("a) Annual MDR genotype prevalence, for countries with at least 50 genomes")

country_facet_39geno_keycountries_MDR

Fig S7b: facet plot genotypes per country, key countries, CipI

cipNS_genotypes_key_countries <- tgc_meta %>%
  select(Year, Final_genotype, Region, Country_Origin, cipNS)%>%
  filter(Year >=2010 & Year <=2020) %>%
  filter(Country_Origin %in% country50) %>%
  filter(!(Country_Origin %in% c("United Kingdom","USA"))) %>%
  mutate(cipNS_geno = ifelse(cipNS, Final_genotype, "CipS")) %>%
  mutate(Final_genotype=replace(Final_genotype, Final_genotype=="4.3.1.2.1.1", "4.3.1.2.1"))

geno_cip <- names(table(cipNS_genotypes_key_countries$Final_genotype)[table(cipNS_genotypes_key_countries$Final_genotype)>=10])

geno_colours_cipI <- read_csv("https://raw.githubusercontent.com/katholt/genotyphi/main/Genotype_specification.csv") %>% filter(Genotype %in% geno_cip)
## Rows: 82 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Genotype, H58 Genotype, Gene in CT18 reference sequence, Product, ...
## dbl  (1): Position in CT18 reference sequence
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
geno_colours_cipNS <- c(geno_colours_cipI$`Colour code`, "grey")
names(geno_colours_cipNS) <- c(geno_colours_cipI$Genotype, "Other")
geno_colours_cipNS["4.3.1.2.1"] = "#980b5c"

geno_colours_cipNS <- geno_colours_cipNS[order(names(geno_colours_cipNS))]
geno_colours_cipNS["CipS"] <- "lightgrey"

cipNS_genotypes_key_countries$cipNS_geno <- factor(cipNS_genotypes_key_countries$cipNS_geno, levels=c("CipS",geno_cip), ordered=T)

country_facet_39geno_keycountries_cipNS <- cipNS_genotypes_key_countries %>%
  mutate(group_genotype = if_else(Final_genotype %in% geno_colours_cipI, Final_genotype, "Other"))%>%
  ggplot(aes(fill=cipNS_geno, x=Year)) + 
  geom_bar(position="fill") + 
  scale_fill_manual(values = geno_colours_cipNS)+
  labs(fill = "Genotype")+
  ylab("")+
  xlab("")+
  facet_wrap(~Region + Country_Origin, ncol=3) + 
  theme_bw() +
  theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=12, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 8, colour = "black"), legend.text = element_text(size=8), legend.key.size=unit(0.25, "cm"), legend.title = element_text(size=8, face="bold")) +
  guides(fill=guide_legend(ncol=1))+
  ggtitle("b) Annual CipNS genotype prevalence, for countries with at least 50 genomes")

country_facet_39geno_keycountries_cipNS

Fig S7: MDR and CipNS genotypes, countries with ≥50 genomes

pdf(file=paste0("SuppFig7_genotypesMainCountries_MDR_CipNS.pdf"),width=9,height=12)
country_facet_39geno_keycountries_MDR / country_facet_39geno_keycountries_cipNS
dev.off()
## quartz_off_screen 
##                 2
png(file=paste0("SuppFig7_genotypesMainCountries_MDR_CipNS.png"),width=9,height=12, units="in", res=400)
country_facet_39geno_keycountries_MDR / country_facet_39geno_keycountries_cipNS
dev.off()
## quartz_off_screen 
##                 2

Section 4 - Compare genotype prevalence estimates by source lab

#select for 2010 - 2020, H58 genotypes plus any genotype >20% prevalent in an individual country
country_lab_year_data <- tgc_from2010 %>%
  mutate(map_genotype = if_else(Final_genotype %in% geno_country20, Final_genotype, "Other"))%>%
  mutate(Country_lab_label = paste(Region, ":", Country_Origin, ":", `Isolating Lab`)) %>%
  select(Country_lab_label, Year, map_genotype, Final_genotype, Region, Country_Origin, `Isolating Lab`)%>%
  filter(Year >=2010) 

Compare key genotype frequencies across countries, by lab - South Asia

country_lab_year_data_SA <- country_lab_year_data %>% 
  filter(Country_Origin %in% c("Bangladesh", "India", "Nepal", "Pakistan"))

country_lab_year_data_SA <- country_lab_year_data_SA %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CDC", "*US (CDC)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PHE", "*England (PHE)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="MDU", "*Australia (MDU)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="ESR", "*New Zealand (ESR)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CHR", "Dhaka (CHR)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="ICD", "Dhaka (ICD)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CMH", "Chittagong (ICD)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="AII", "New Delhi (AII)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CML", "Ludhiana (CML)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CNH", "New Delhi (CNH)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KHM", "Mumbai (KHM)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KKC", "Chennai (KKC)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="RDT", "Bathalapalli (RDT)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="SAP", "New Delhi (SAP)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CMC", "Vellore (CMC)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="BCH", "Mumbai (KIM)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PGI", "Chandigarh (PGI)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="SJB", "Bengaluru (SJB)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KUH", "Dhulikhel (KUH)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PAH", "Lalitpur (PAH)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="AKU", "Karachi (AKU)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="JHL", "Lahore (JHL)"))

lab_N_SA <- country_lab_year_data_SA %>%
  group_by(Country_Origin, `Isolating Lab`)%>% 
  summarize(lab_total =n())
## `summarise()` has grouped output by 'Country_Origin'. You can override using
## the `.groups` argument.
lab_N_SA
## # A tibble: 46 × 3
## # Groups:   Country_Origin [4]
##    Country_Origin `Isolating Lab`    lab_total
##    <chr>          <chr>                  <int>
##  1 Bangladesh     *Australia (MDU)          20
##  2 Bangladesh     *England (PHE)           104
##  3 Bangladesh     *US (CDC)                 88
##  4 Bangladesh     Chittagong (ICD)          26
##  5 Bangladesh     Dhaka (CHR)              821
##  6 Bangladesh     Dhaka (ICD)              528
##  7 Bangladesh     IDS                        4
##  8 India          *Australia (MDU)         182
##  9 India          *England (PHE)           441
## 10 India          *New Zealand (ESR)        47
## # … with 36 more rows
country_lab_data_SA_genotypeFreq <- country_lab_year_data_SA %>% 
  group_by(Country_Origin, `Isolating Lab`, Final_genotype) %>%
  summarise (n=n())%>%
  left_join(lab_N_SA) %>% # annual lab counts
  mutate(lab_freq = n/lab_total, 
         lab_freq_sd = sqrt((lab_freq*(1-lab_freq))/lab_total),
         min=lab_freq-1.96*lab_freq_sd,
         max=lab_freq+1.96*lab_freq_sd)
## `summarise()` has grouped output by 'Country_Origin', 'Isolating Lab'. You can
## override using the `.groups` argument.
## Joining, by = c("Country_Origin", "Isolating Lab")
# 4.3.1.2 in India
country_lab_data_SA_genotypeFreq %>% filter(Country_Origin=="India") %>% filter(Final_genotype=="4.3.1.2")
## # A tibble: 19 × 9
## # Groups:   Country_Origin, Isolating Lab [19]
##    Country_Origin `Isolating Lab`    Final_genotype     n lab_total lab_freq
##    <chr>          <chr>              <chr>          <int>     <int>    <dbl>
##  1 India          *Australia (MDU)   4.3.1.2           49       182   0.269 
##  2 India          *England (PHE)     4.3.1.2          134       441   0.304 
##  3 India          *New Zealand (ESR) 4.3.1.2           11        47   0.234 
##  4 India          *US (CDC)          4.3.1.2          115       359   0.320 
##  5 India          Bathalapalli (RDT) 4.3.1.2            2        31   0.0645
##  6 India          Bengaluru (SJB)    4.3.1.2           57       124   0.460 
##  7 India          Chandigarh (PGI)   4.3.1.2           42       237   0.177 
##  8 India          Chennai (KKC)      4.3.1.2           30        81   0.370 
##  9 India          IDS                4.3.1.2            3         5   0.6   
## 10 India          LWH                4.3.1.2            1        12   0.0833
## 11 India          MCL                4.3.1.2            1        10   0.1   
## 12 India          Mumbai (KHM)       4.3.1.2            6        32   0.188 
## 13 India          Mumbai (KIM)       4.3.1.2           15        92   0.163 
## 14 India          New Delhi (AII)    4.3.1.2            6        31   0.194 
## 15 India          New Delhi (CNH)    4.3.1.2           24       117   0.205 
## 16 India          New Delhi (SAP)    4.3.1.2            2        25   0.08  
## 17 India          NIC                4.3.1.2            8        18   0.444 
## 18 India          TNM                4.3.1.2            3        18   0.167 
## 19 India          Vellore (CMC)      4.3.1.2          241       336   0.717 
## # … with 3 more variables: lab_freq_sd <dbl>, min <dbl>, max <dbl>

add pooled country-wide estimate with CIs

# add pooled estimate
country_lab_data_SA_genotypeFreq <- tgc_from2010_countryFreq_allGeno %>% 
  filter(Country_Origin %in% c("India", "Bangladesh","Nepal","Pakistan")) %>%
  rename(lab_freq=p, min=p_lowerCI, max=p_upperCI, lab_total=N, n=x) %>%
  mutate(`Isolating Lab`="(Pooled)") %>%
  bind_rows(country_lab_data_SA_genotypeFreq) %>%
  ungroup() %>%
  select(-c(Region, lab_freq_sd))

facet plot for genotype prevalence estimates by lab, South Asia

# choose most common genotypes to plot
SA_geno_freq <- rev(sort(table(country_lab_year_data_SA$Final_genotype)))

country_lab_data_SA_genotypeFreq_estimates <-
country_lab_data_SA_genotypeFreq %>%
  mutate(pooled=if_else(`Isolating Lab`=="(Pooled)",1,0.5)) %>%
  filter(Final_genotype %in% names(SA_geno_freq[1:9])) %>%
  filter(lab_total >=20) %>%
  mutate(country_lab=paste(Country_Origin, ":", `Isolating Lab`)) %>%
  ggplot() +
  geom_linerange( mapping=aes(y=country_lab,  xmin=min, xmax=max, colour=Country_Origin)) +
  geom_point(mapping=aes(y=country_lab, x=lab_freq, colour=Country_Origin, alpha=pooled), size=1) +
  facet_wrap(. ~ Final_genotype, ncol=3) + theme_bw() +
  scale_color_manual(values=c("red","blue","orange","black")) + 
  guides(alpha = "none") +
  labs(color="Country:", x="Genotype prevalence estimate (%)", y="Source laboratory") + 
  theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=10, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 8, colour = "black"), axis.text.y = element_text(size = 7, colour = "black"), legend.title = element_text(size=10, face="bold"), legend.position="bottom")

country_lab_data_SA_genotypeFreq_estimates

country_lab_data_SA_genotypeFreq
## # A tibble: 468 × 8
##    Country_Origin Final_genotype     n lab_total lab_freq      min     max
##    <chr>          <chr>          <int>     <int>    <dbl>    <dbl>   <dbl>
##  1 Bangladesh     2                 17      1591 0.0107   0.00563  0.0157 
##  2 Bangladesh     2.0.1            103      1591 0.0647   0.0526   0.0768 
##  3 Bangladesh     2.1.7             34      1591 0.0214   0.0143   0.0285 
##  4 Bangladesh     2.2.1              1      1591 0.000629 0        0.00186
##  5 Bangladesh     2.3.3            274      1591 0.172    0.154    0.191  
##  6 Bangladesh     2.4                1      1591 0.000629 0        0.00186
##  7 Bangladesh     3.0.1              6      1591 0.00377  0.000759 0.00678
##  8 Bangladesh     3.1.2             12      1591 0.00754  0.00329  0.0118 
##  9 Bangladesh     3.2                1      1591 0.000629 0        0.00186
## 10 Bangladesh     3.2.2            264      1591 0.166    0.148    0.184  
## # … with 458 more rows, and 1 more variable: `Isolating Lab` <chr>

Fig S12: genotype freq estimates by lab, South Asia

pdf(file=paste0("SuppFig12_country_lab_data_SA_genotypeFreq_estimates.pdf"),width=8,height=12)
country_lab_data_SA_genotypeFreq_estimates
dev.off()
## quartz_off_screen 
##                 2
png(file=paste0("SuppFig12_country_lab_data_SA_genotypeFreq_estimates.png"),width=8,height=12, units="in", res=400)
country_lab_data_SA_genotypeFreq_estimates
dev.off()
## quartz_off_screen 
##                 2

compare CIs

# lab-specific
lab_level <- country_lab_data_SA_genotypeFreq %>% 
  filter(`Isolating Lab` != "(Pooled)")

pooled <- country_lab_data_SA_genotypeFreq %>% 
  filter(`Isolating Lab` == "(Pooled)") %>%
  rename(pooled_freq=lab_freq, pooled_min=min, pooled_max=max)

labs_pooled <- left_join(lab_level, pooled, by=c("Country_Origin", "Final_genotype"))

labs_pooled <- labs_pooled %>% 
  filter(Final_genotype %in% names(SA_geno_freq[1:9])) %>%
  filter(lab_total.x >=20)

nrow(labs_pooled)
## [1] 142
labs_pooled_overlap_estimate <- labs_pooled %>%
  filter((lab_freq > pooled_min) & (lab_freq < pooled_max))

nrow(labs_pooled_overlap_estimate)/nrow(labs_pooled)
## [1] 0.2394366
labs_pooled_overlap_interval <- labs_pooled %>%
  filter(!( (max<pooled_min) | (min>pooled_max)))

nrow(labs_pooled_overlap_interval)/nrow(labs_pooled)
## [1] 0.7042254

Fig 6a: Genotype freqs annual dist per lab, 2010-2020 - South Asia, labs with N≥20

country_lab_year_data_SA <- country_lab_year_data %>% 
  filter(Country_Origin %in% c("Bangladesh", "India", "Nepal", "Pakistan"))

# set genotype colours
geno_info_SA <- read_csv("https://raw.githubusercontent.com/katholt/genotyphi/main/Genotype_specification.csv") %>% filter(Genotype %in% country_lab_year_data_SA$Final_genotype)
## Rows: 82 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Genotype, H58 Genotype, Gene in CT18 reference sequence, Product, ...
## dbl  (1): Position in CT18 reference sequence
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
geno_colours_SA <- c(geno_info_SA$`Colour code`)

names(geno_colours_SA) <- c(geno_info_SA$Genotype)

# distinguish H58 subtypes
geno_colours_SA["4.3.1.2.EA2"] = "#ee37c9"
geno_colours_SA["4.3.1.2.EA3"] = "#94165f"
geno_colours_SA["4.3.1.2.1"] = "#980b5c"

geno_colours_SA <- geno_colours_SA[order(names(geno_colours_SA))] # sort for legend

lab_counts <- table(country_lab_year_data$Country_lab_label)

# South Asian countries with multiple source labs, with N≥20 per lab
country_facet_lab_SAsia <- country_lab_year_data_SA %>% 
  filter(Country_lab_label %in% names(lab_counts[lab_counts>=20])) %>%
  ggplot(aes(fill=Final_genotype, x=Year)) + 
  geom_bar(position="fill") + 
  scale_fill_manual(values = geno_colours_SA)+
  labs(fill = "Genotype", x="", y="")+
  facet_wrap(~Country_Origin + `Isolating Lab`, ncol=7) + 
  theme_bw() +
  theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=9, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"), legend.title=element_text(size=8, face="bold"), legend.text = element_text(size=8), legend.key.size=unit(.25, "cm"), legend.position="bottom") +
  guides(fill=guide_legend(nrow=4))

country_facet_lab_SAsia

Genotype per lab, frequence barplot, Nigeria

country_lab_year_data_Nigeria <- country_lab_year_data %>% 
  filter(Country_Origin == "Nigeria")

# set genotype colours
geno_info_nigeria <- read_csv("https://raw.githubusercontent.com/katholt/genotyphi/main/Genotype_specification.csv") %>% filter(Genotype %in% country_lab_year_data_Nigeria$Final_genotype)
## Rows: 82 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Genotype, H58 Genotype, Gene in CT18 reference sequence, Product, ...
## dbl  (1): Position in CT18 reference sequence
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
geno_colours_nigeria <- c(geno_info_nigeria$`Colour code`,"grey")

names(geno_colours_nigeria) <- c(geno_info_nigeria$Genotype, "Other")

geno_colours_nigeria <- geno_colours_nigeria[order(names(geno_colours_nigeria))] # sort for legend

country_facet_lab_Nigeria <- country_lab_year_data_Nigeria %>% 
  filter(Country_lab_label %in% names(lab_counts[lab_counts>=10])) %>%
  filter(!is.na(`Isolating Lab`)) %>%
  ggplot(aes(fill=Final_genotype, x=Year)) + 
  geom_bar(position="fill") + 
  scale_fill_manual(values = geno_colours_nigeria)+
  labs(fill = "Genotype", x="", y="", title="c) Annual genotype prevalence, by lab")+
  facet_wrap(~`Isolating Lab`, ncol=2) + 
  theme_bw() +
  theme(strip.background = element_rect(fill ="#f0f0f5"),
        strip.text=element_text(size=10, face="bold"), 
        axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"),
        axis.text.y = element_text(size = 7, colour = "black"),
        legend.title=element_text(size=8, face="bold"), 
        legend.text = element_text(size=8), legend.key.size=unit(.25, "cm"),
        legend.position="right", 
        plot.title=element_text(size = 9, colour = "black", face="bold"))

country_facet_lab_Nigeria

Compare genotype frequency estimates between labs, Nigeria

lab_N_Nigeria <- country_lab_year_data_Nigeria %>%
  group_by(`Isolating Lab`)%>% 
  summarize(lab_total =n())

lab_N_Nigeria
## # A tibble: 5 × 2
##   `Isolating Lab` lab_total
##   <chr>               <int>
## 1 AKT                     6
## 2 CDC                    10
## 3 IBA                    14
## 4 PHE                    15
## 5 ZMC                   105
country_lab_data_Nigeria_genotypeFreq <- country_lab_year_data_Nigeria %>% 
  group_by(`Isolating Lab`, Final_genotype) %>%
  summarise (n=n())%>%
  left_join(lab_N_Nigeria) %>% # annual lab counts
  mutate(lab_freq = n/lab_total, 
         lab_freq_sd = sqrt((lab_freq*(1-lab_freq))/lab_total),
         min=lab_freq-1.96*lab_freq_sd,
         max=lab_freq+1.96*lab_freq_sd)
## `summarise()` has grouped output by 'Isolating Lab'. You can override using the
## `.groups` argument.
## Joining, by = "Isolating Lab"
country_lab_data_Nigeria_genotypeFreq %>% filter(Final_genotype=="3.1.1")
## # A tibble: 5 × 8
## # Groups:   Isolating Lab [5]
##   `Isolating Lab` Final_genotype     n lab_total lab_freq lab_freq_sd   min
##   <chr>           <chr>          <int>     <int>    <dbl>       <dbl> <dbl>
## 1 AKT             3.1.1              5         6    0.833      0.152  0.535
## 2 CDC             3.1.1              7        10    0.7        0.145  0.416
## 3 IBA             3.1.1             14        14    1          0      1    
## 4 PHE             3.1.1              8        15    0.533      0.129  0.281
## 5 ZMC             3.1.1             67       105    0.638      0.0469 0.546
## # … with 1 more variable: max <dbl>

add pooled country-wide estimate with CIs

# add pooled estimate
country_lab_data_Nigeria_genotypeFreq <- tgc_from2010_countryFreq_allGeno %>% 
  filter(Country_Origin %in% c("Nigeria")) %>%
  rename(lab_freq=p, min=p_lowerCI, max=p_upperCI, lab_total=N, n=x) %>%
  mutate(`Isolating Lab`="(Pooled)") %>%
  bind_rows(country_lab_data_Nigeria_genotypeFreq) %>%
  ungroup() %>%
  select(-c(Region, lab_freq_sd, Country_Origin))

facet line plot of estimates for key genotypes by lab, Nigeria

# choose most common genotypes to plot
nigeria_geno_freq <- rev(sort(table(country_lab_year_data_Nigeria$Final_genotype)))

country_lab_data_Nigeria_genotypeFreq_estimates <-
country_lab_data_Nigeria_genotypeFreq %>%
  mutate(pooled=if_else(`Isolating Lab`=="(Pooled)","Pooled","Individual")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CDC", "*US (CDC)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PHE", "*England (PHE)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="IBA", "Ibadan (IBD)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="ZMC", "Abuja (ZMC)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="(Pooled)", "Nigeria (Pooled)")) %>%
  filter(Final_genotype %in% names(nigeria_geno_freq[1:5])) %>%
  filter(lab_total >=10) %>%
  ggplot() +
  geom_linerange( mapping=aes(y=`Isolating Lab`,  xmin=min, xmax=max, colour=pooled)) +
  geom_point(mapping=aes(y=`Isolating Lab`, x=lab_freq, colour=pooled), size=1) +
  facet_wrap(. ~ Final_genotype, ncol=3) + theme_bw() +
  scale_color_manual(values=c("red","black")) + 
  xlab("") +
  ylab("") +
  labs(color="Estimate type")+ 
  ggtitle("a) Genotype prevalence, by lab") +
  theme(strip.background = element_rect(fill ="#f0f0f5"),
        strip.text=element_text(size=10, face="bold"), 
        axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"),
        axis.text.y = element_text(size = 7, colour = "black"),
        legend.title=element_text(size=8, face="bold"), 
        legend.text = element_text(size=8), legend.key.size=unit(.25, "cm"),
        legend.position="right", 
        plot.title=element_text(size = 9, colour = "black", face="bold"))

country_lab_data_Nigeria_genotypeFreq_estimates

Section 4b - Compare AMR estimates by source lab

Fig 6b: tabulate annual resistance freqs per country and lab, for South Asia

annual_res_country_lab <- tgc_meta %>%
  mutate(cefR = as.numeric(CEP), 
         XDR = if_else(XDR == T, 1, 0), 
         MDR = if_else(MDR ==T, 1, 0), 
         cipNS = if_else(cipNS == T, 1, 0), 
         cipR = if_else(cipR == T, 1, 0), 
         aziR = if_else(aziR == T, 1, 0)) %>% #convert PW AMR calls to numeric values
  filter(Year >= 2010) %>% # restrict to 2010 onwards
    group_by(Country_Origin, Year, `Isolating Lab`) %>%
  summarize(n=n(),
            sumMDR= sum(MDR),
            MDR= round(sumMDR/n*100, digits =0),
            sumXDR= sum(XDR),
            XDR= round(sumXDR/n*100, digits = 0),
            sumcipNS = sum(cipNS),
            cipNS = round(sumcipNS/n*100, digits = 0),
            sumcipR = sum(cipR),
            cipR = round(sumcipR/n*100, digits = 0),
            sumcefR = sum(cefR),
            cefR = round(sumcefR/n*100, digits = 0),
            sumaziR = sum(aziR),
            aziR = round(sumaziR/n*100, digits =0)
            ) %>% 
  unique()
## `summarise()` has grouped output by 'Country_Origin', 'Year'. You can override
## using the `.groups` argument.

pooled estimates (borrowed from AMR Rmd)

# clean up AMR data (for surveillance-ready samples)
res_data <- tgc_meta %>%
  mutate(cefR = as.numeric(CEP), 
         XDR = if_else(XDR == T, 1, 0), 
         MDR = if_else(MDR ==T, 1, 0), 
         cipNS = if_else(cipNS == T, 1, 0), 
         cipR = if_else(cipR == T, 1, 0), 
         aziR = if_else(aziR == T, 1, 0)) %>% #convert PW AMR calls to numeric values
  select(Country_Origin, Region, Year, cefR, cipNS, cipR, MDR, XDR, aziR, IncHI1)

# map object has "UK", "Vietnam", changes ours to match this string format
res_data$Country_Origin <- replace(res_data$Country_Origin, res_data$Country_Origin=="United Kingdom", "UK")
res_data$Country_Origin <- replace(res_data$Country_Origin, res_data$Country_Origin=="Viet Nam", "Vietnam")

# prevalence per country, for 2010 onwards
res_freq_table <- res_data %>%
  filter(Year >= 2010) %>% # restrict to last 10 years
  group_by(Country_Origin)%>%
  summarize(n=n(),
            sumMDR= sum(MDR),
            MDR= round(sumMDR/n*100, digits =0),
            sumXDR= sum(XDR),
            XDR= round(sumXDR/n*100, digits = 0),
            sumcipNS = sum(cipNS),
            cipNS = round(sumcipNS/n*100, digits = 0),
            sumcipR = sum(cipR),
            cipR = round(sumcipR/n*100, digits = 0),
            sumcefR = sum(cefR),
            cefR = round(sumcefR/n*100, digits = 0),
            sumaziR = sum(aziR),
            aziR = round(sumaziR/n*100, digits =0),
            MDR_sd = sqrt((sumMDR/n*(1-sumMDR/n))/n),
            XDR_sd = sqrt((sumXDR/n*(1-sumXDR/n))/n),
            cipNS_sd = sqrt((sumcipNS/n*(1-sumcipNS/n))/n),
            cipR_sd = sqrt((sumcipR/n*(1-sumcipR/n))/n),
            cefR_sd = sqrt((sumcefR/n*(1-sumcefR/n))/n),
            aziR_sd = sqrt((sumaziR/n*(1-sumaziR/n))/n)
            ) %>% 
  unique()

# annual by country
res_freq_annual_table <- res_data %>%
  filter(Year >= 2000) %>% # restrict to last 20 years
  mutate(MDR_IncHI1 = ifelse(IncHI1, MDR, 0)) %>%
  group_by(Country_Origin, Year) %>%
  summarize(n=n(),
            sumMDR= sum(MDR),
            MDR= round(sumMDR/n*100, digits =0),
            sumXDR= sum(XDR),
            XDR= round(sumXDR/n*100, digits = 0),
            sumcipNS = sum(cipNS),
            cipNS = round(sumcipNS/n*100, digits = 0),
            sumcipR = sum(cipR),
            cipR = round(sumcipR/n*100, digits = 0),
            sumcefR = sum(cefR),
            cefR = round(sumcefR/n*100, digits = 0),
            sumaziR = sum(aziR),
            aziR = round(sumaziR/n*100, digits =0),
            MDR_sd = sqrt((sumMDR/n*(1-sumMDR/n))/n),
            XDR_sd = sqrt((sumXDR/n*(1-sumXDR/n))/n),
            cipNS_sd = sqrt((sumcipNS/n*(1-sumcipNS/n))/n),
            cipR_sd = sqrt((sumcipR/n*(1-sumcipR/n))/n),
            cefR_sd = sqrt((sumcefR/n*(1-sumcefR/n))/n),
            aziR_sd = sqrt((sumaziR/n*(1-sumaziR/n))/n),
            sumMDR_IncHI1 = sum(MDR_IncHI1),
            `IncHI1 amongst MDR` = round(sumMDR_IncHI1/sumMDR*100, digits=0)) %>% 
  unique()
## `summarise()` has grouped output by 'Country_Origin'. You can override using
## the `.groups` argument.

Compare annual AMR for labs within countries - line plots, South Asia

southAsia_amr_lab <- annual_res_country_lab %>%
  filter(Country_Origin %in% c("Bangladesh","India","Nepal","Pakistan")) %>%
  filter(n>=10) %>% # include only estimates with N≥10
  as.data.frame() %>%
    melt(id.vars=c("Country_Origin","Year","Isolating Lab"), variable.name="AMR") %>%
    filter(AMR %in% c("MDR","cefR","cipR","cipNS")) 

southAsia_amr_lab$value <- as.numeric(as.character(southAsia_amr_lab$value))

southAsia_amr_lab$Year <- as.numeric(as.character(southAsia_amr_lab$Year))

res_freq_annual_table$Year <- as.numeric(as.character(res_freq_annual_table$Year))

## add line for combined estimate
south_asia_amr <- res_freq_annual_table %>% 
  filter(Country_Origin %in% c("Bangladesh","India","Nepal","Pakistan")) %>%
  filter(n>=10) %>% # plot only those years with sufficient data
  as.data.frame() %>%
  melt(id.vars=c("Country_Origin","Year"), variable.name="AMR") %>%
  filter(AMR %in% c("MDR","cefR","cipR","cipNS")) %>% 
  mutate(`Isolating Lab`="Pooled") %>%
  bind_rows(southAsia_amr_lab) %>%
  mutate(estimate_type=ifelse(`Isolating Lab`=="Pooled",2,1)) %>%
  replace_na(list(estimate_type = 1, `Isolating Lab` = "UNK")) # NA lab is unknown lab from Pakistan data in Wong 2015

# order levels
south_asia_amr$`Isolating Lab` <- factor(south_asia_amr$`Isolating Lab`)
index_combined <- which("Pooled" == levels(south_asia_amr$`Isolating Lab`))

south_asia_amr$`Isolating Lab` <- factor(south_asia_amr$`Isolating Lab`, levels=levels(south_asia_amr$`Isolating Lab`)[c(index_combined, 1:(index_combined-1), (index_combined+1):length(levels(south_asia_amr$`Isolating Lab`)))], ordered=T)

southAsia_amr_lab_plot_combinedEst <- ggplot(south_asia_amr, aes(x=Year, y=value, group=`Isolating Lab`, colour=`Isolating Lab`)) +
  geom_line(size = 1, aes(group=`Isolating Lab`)) +
  geom_point(size = 1) +
  facet_wrap(. ~ Country_Origin + AMR) + theme_bw() + 
  scale_color_manual(values=c(alpha("black",1), alpha(unname(alphabet())[1:22],0.5))) + 
  ylab("Resistance (%)") +
  theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=10, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 8, colour = "black"), legend.title = element_text(size=10, face="bold"), legend.position="bottom") +
  guides(colour=guide_legend(nrow=3))

southAsia_amr_lab_plot_combinedEst

Fig 6: combined fig with annual AMR & genotype for South Asia

pdf(file=paste0("Fig6_annualGeno_annualAMR_by_lab_SouthAsia.pdf"),width=8,height=12)
(country_facet_lab_SAsia + ggtitle("a) Annual genotype prevalence, by lab")) / (southAsia_amr_lab_plot_combinedEst + ggtitle("b) Annual AMR prevalence, by lab"))
dev.off()
## quartz_off_screen 
##                 2
png(file=paste0("Fig6_annualGeno_annualAMR_by_lab_SouthAsia.png"),width=8, height=12, units="in", res=400)
(country_facet_lab_SAsia + ggtitle("a) Annual genotype prevalence, by lab")) / (southAsia_amr_lab_plot_combinedEst + ggtitle("b) Annual AMR prevalence, by lab"))
dev.off()
## quartz_off_screen 
##                 2

Compare AMR frequencies across countries, by lab - South Asia

country_lab_data_SA_amrFreq <- tgc_from2010 %>% 
  filter(Country_Origin %in% c("Bangladesh", "India", "Nepal", "Pakistan")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CDC", "*US (CDC)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PHE", "*England (PHE)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="MDU", "*Australia (MDU)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="ESR", "*New Zealand (ESR)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CHR", "Dhaka (CHR)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="ICD", "Dhaka (ICD)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CMH", "Chittagong (ICD)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="AII", "New Delhi (AII)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CML", "Ludhiana (CML)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CNH", "New Delhi (CNH)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KHM", "Mumbai (KHM)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KKC", "Chennai (KKC)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="RDT", "Andhra Pradesh (RDT)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="SAP", "New Delhi (SAP)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CMC", "Vellore (CMC)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KIMS", "Mumbai (KIM)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PGI", "Chandigarh (PGI)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="SJB", "Bengaluru (SJB)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="KUH", "Dhulikhel (KUH)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PAH", "Lalitpur (PAH)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="AKU", "Karachi (AKU)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="JHL", "Lahore (JHL)")) %>%
  select(Country_Origin, Region, `Isolating Lab`, MDR, cipNS, cipR, aziR, CEP, XDR) %>%
  rename(cefR=CEP) %>%
  mutate(Country_lab_label = paste(Country_Origin, ":", `Isolating Lab`)) %>%
  select(-Region, -`Isolating Lab`) %>%
  melt(id.vars=c("Country_lab_label","Country_Origin"), variable.name="AMR") %>%
  group_by(Country_lab_label, Country_Origin, AMR) %>%
  summarise (n=n(),
             x=sum(value),
             .groups = "keep") %>%
  mutate(lab_freq = x/n, 
         lab_freq_sd = sqrt((lab_freq*(1-lab_freq))/n),
         min=lab_freq-1.96*lab_freq_sd,
         max=lab_freq+1.96*lab_freq_sd)

add pooled country-wide estimate with CIs

# add pooled estimate
SA_amrFreq <- tgc_from2010 %>% 
  select(Country_Origin, MDR, cipNS, cipR, aziR, CEP, XDR) %>%
  rename(cefR=CEP) %>%
  filter(Country_Origin %in% c("Bangladesh", "India", "Nepal", "Pakistan")) %>%
melt(id.vars=c("Country_Origin"), variable.name="AMR") %>%
  group_by(Country_Origin, AMR) %>%
  summarise (n=n(),
             x=sum(value),
             .groups = "keep") %>%
  mutate(lab_freq = x/n, 
         lab_freq_sd = sqrt((lab_freq*(1-lab_freq))/n),
         min=lab_freq-1.96*lab_freq_sd,
         max=lab_freq+1.96*lab_freq_sd) %>%
  mutate(Country_lab_label=paste0(Country_Origin, " : (Pooled)")) %>%
  bind_rows(country_lab_data_SA_amrFreq)

Fig S13: facet estimate plot for AMR freqs in South Asia labs

country_lab_data_SA_amrFreq_estimates <-
SA_amrFreq %>%
  mutate(pooled=if_else(grepl("(Pooled)", Country_lab_label),1,0.5)) %>%
  filter(n >=20) %>% #minimum annual count per lab
  ggplot() +
  geom_linerange( mapping=aes(y=Country_lab_label, xmin=min, xmax=max, colour=Country_Origin)) +
  geom_point(mapping=aes(y=Country_lab_label, x=lab_freq, colour=Country_Origin, alpha=pooled), size=1) +
  facet_wrap(. ~ AMR, ncol=3) + theme_bw() +
  scale_color_manual(values=c("red","blue","orange","black")) + 
  xlab("AMR prevalence estimate (%)") +
  ylab("Source laboratory") +
  guides(alpha = "none") +
  labs(color="Country:")+ 
  theme(strip.background = element_rect(fill ="#f0f0f5"), strip.text=element_text(size=10, face="bold"), axis.text.x = element_text(angle = 45, hjust = 1, size = 8, colour = "black"), axis.text.y = element_text(size = 8, colour = "black"), legend.title = element_text(size=10, face="bold"), legend.position="bottom")

country_lab_data_SA_amrFreq_estimates

pdf(file=paste0("SuppFig13_country_lab_data_SA_amrFreq_estimates.pdf"),width=8, height=9)
country_lab_data_SA_amrFreq_estimates
dev.off()
## quartz_off_screen 
##                 2
png(file=paste0("SuppFig13_country_lab_data_SA_amrFreq_estimates.png"),width=8, height=9, units="in", res=400)
country_lab_data_SA_amrFreq_estimates
dev.off()
## quartz_off_screen 
##                 2

compare CIs for AMR in South Asia

# lab-specific
lab_level <- SA_amrFreq %>% 
  filter(!(Country_lab_label %in% c("Bangladesh : (Pooled)", "India : (Pooled)", "Nepal : (Pooled)", "Pakistan : (Pooled)")))

pooled <- SA_amrFreq %>% 
  filter(Country_lab_label %in% c("Bangladesh : (Pooled)", "India : (Pooled)", "Nepal : (Pooled)", "Pakistan : (Pooled)")) %>%
  rename(pooled_freq=lab_freq, pooled_min=min, pooled_max=max)

labs_pooled <- left_join(lab_level, pooled, by=c("Country_Origin", "AMR"))

labs_pooled <- labs_pooled %>% 
  filter(n.x >=20)

nrow(labs_pooled)
## [1] 168
labs_pooled_overlap_estimate <- labs_pooled %>%
  filter((lab_freq > pooled_min) & (lab_freq < pooled_max))

nrow(labs_pooled_overlap_estimate)/nrow(labs_pooled)
## [1] 0.2440476
labs_pooled_overlap_interval <- labs_pooled %>%
  filter(!( (max<pooled_min) | (min>pooled_max)))

nrow(labs_pooled_overlap_interval)/nrow(labs_pooled)
## [1] 0.5833333

Compare AMR estimates for Nigeria from different labs

country_lab_data_Nigeria_amrFreq <- tgc_from2010 %>% 
  filter(Country_Origin == "Nigeria") %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="CDC", "*US (CDC)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="PHE", "*England (PHE)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="IBA", "Ibadan (IBD)")) %>%
  mutate(`Isolating Lab` = replace(`Isolating Lab`, `Isolating Lab`=="ZMC", "Abuja (ZMC)")) %>%
  select(`Isolating Lab`, MDR, cipNS, cipR, aziR, CEP, XDR) %>%
  rename(cefR=CEP) %>%
  melt(id.vars=c("Isolating Lab"), variable.name="AMR") %>%
  group_by(`Isolating Lab`, AMR) %>%
  summarise (n=n(),
             x=sum(value),
             .groups = "keep") %>%
  mutate(lab_freq = x/n, 
         lab_freq_sd = sqrt((lab_freq*(1-lab_freq))/n),
         min=lab_freq-1.96*lab_freq_sd,
         max=lab_freq+1.96*lab_freq_sd)

nigeria_amrFreq <- tgc_from2010 %>% 
  select(Country_Origin, MDR, cipNS, cipR, aziR, CEP, XDR) %>%
  rename(cefR=CEP) %>%
  filter(Country_Origin == "Nigeria") %>%
melt(id.vars=c("Country_Origin"), variable.name="AMR") %>%
  group_by(Country_Origin, AMR) %>%
  summarise (n=n(),
             x=sum(value),
             .groups = "keep") %>%
  mutate(lab_freq = x/n, 
         lab_freq_sd = sqrt((lab_freq*(1-lab_freq))/n),
         min=lab_freq-1.96*lab_freq_sd,
         max=lab_freq+1.96*lab_freq_sd) %>%
  mutate(Country_Origin=replace(Country_Origin, Country_Origin=="Nigeria", "Nigeria (Pooled)")) %>%
  rename(`Isolating Lab`=Country_Origin) %>%
  bind_rows(country_lab_data_Nigeria_amrFreq)

Fig 7b: facet estimate plot for AMR freqs in Nigeria labs

country_lab_data_Nigeria_amrFreq_estimates <-
nigeria_amrFreq %>%
  filter(n>=10) %>%
  mutate(pooled=ifelse(`Isolating Lab`=="Nigeria (Pooled)","Pooled","Individual")) %>%
  ggplot() +
  geom_linerange( mapping=aes(y=`Isolating Lab`, xmin=min, xmax=max, colour=pooled)) +
  geom_point(mapping=aes(y=`Isolating Lab`, x=lab_freq, colour=pooled), size=1) +
  facet_wrap(. ~ AMR, ncol=3) + theme_bw() +
  labs(x="", y="", title="b) AMR prevalence, by lab", colour="Estimate type") +
  scale_color_manual(values=c("red","black")) +
  theme(strip.background = element_rect(fill ="#f0f0f5"),
        strip.text=element_text(size=10, face="bold"), 
        axis.text.x = element_text(angle = 45, hjust = 1, size = 6, colour = "black"),
        axis.text.y = element_text(size = 7, colour = "black"),
        legend.title=element_text(size=8, face="bold"), 
        legend.text = element_text(size=8), legend.key.size=unit(.25, "cm"),
        legend.position="right", 
        plot.title=element_text(size = 9, colour = "black", face="bold"))

country_lab_data_Nigeria_amrFreq_estimates

Fig 7: combined fig with annual AMR & genotype for Nigeria

pdf(file=paste0("Fig7_Nigeria_labcomparisons.pdf"),width=5, height=8)
country_lab_data_Nigeria_genotypeFreq_estimates / country_lab_data_Nigeria_amrFreq_estimates / country_facet_lab_Nigeria
dev.off()
## quartz_off_screen 
##                 2
png(file=paste0("Fig7_Nigeria_labcomparisons.png"),width=5, height=8, units="in", res=400)
country_lab_data_Nigeria_genotypeFreq_estimates / country_lab_data_Nigeria_amrFreq_estimates / country_facet_lab_Nigeria
dev.off()
## quartz_off_screen 
##                 2